# Unlike the pre-processing of the preliminary dataset, the complete datasets are loaded here (including the 'empty' droplets generated by the Chromium) so that the top genes present in the background of each dataset can be investigated.
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import os
# Scanpy log level: errors (0), warnings (1), info (2), hints (3).
sc.settings.verbosity = 3
# Report the versions of the packages in use (scanpy logging helper).
sc.logging.print_versions()
def remove_RB_genes(
        df,
        path_to_RB_genes_file='/data/deprez_data/HCA/PeerLab_analysis/RB_genes'
):
    """Remove every ribosomal (RB) gene column listed in the RB gene file.

    Parameters
    ----------
    df : pandas.DataFrame
        Count table with one row per cell and one column per gene.
        It is not modified.
    path_to_RB_genes_file : str
        Text file containing gene names, one gene name per line.

    Returns
    -------
    df_RB_depleted : pandas.DataFrame
        Copy of ``df`` without the RB gene columns.
    counts_removed_per_cell : pandas.Series
        Summed counts of the removed RB genes for each cell, indexed like
        ``df`` (float64).
    RB_genes_in_df : list of str
        RB genes from the file that are columns of ``df``, in file order,
        each listed once.
    """
    with open(path_to_RB_genes_file, 'r') as file:
        # strip() tolerates stray whitespace around a name; blank lines are
        # skipped because they can never match a column.
        genes = [line.strip() for line in file]
    # dict.fromkeys de-duplicates while keeping file order, so a gene listed
    # twice is neither counted twice nor reported twice.
    genes = list(dict.fromkeys(gene for gene in genes if gene))

    df_genes = df.columns
    RB_genes_in_df = [gene for gene in genes if gene in df_genes]
    # NOTE: an alternative that was previously sketched here selected genes by
    # name prefix ('RPL' / 'RPS') instead of using the gene file.

    # Number of RB molecules per cell, computed as one vectorised row sum
    # instead of a Python loop over cells. The explicit float64 cast keeps a
    # stable numeric dtype (also when no RB gene is present: all zeros).
    counts_removed_per_cell = df[RB_genes_in_df].sum(axis=1).astype('float64')

    # Now drop all columns with RB genes.
    df_RB_depleted = df.drop(columns=RB_genes_in_df)
    return df_RB_depleted, counts_removed_per_cell, RB_genes_in_df
# Work from the data root so that the per-sample paths used below stay relative.
os.chdir('/data/deprez_data/HCA/Data/')
sc.settings.set_figure_params(dpi=80)

# Sub-directories (relative to each sample folder) holding the raw and the
# filtered gene-barcode matrices.
outsPath = 'outs/raw_gene_bc_matrices/ucagenomix-cellranger-hg19-1.3.0/'
outsPathFilter = 'outs/filtered_gene_bc_matrices/ucagenomix-cellranger-hg19-1.3.0/'

# Per-sample summary table: one row per sample, every cell initialised to 0.
columns = ['background', 'nb_cells', 'nb_genes', 'position', 'donor', 'method']
index = [
    'D322_Biop_Nas1', 'D322_Biop_Pro1', 'D322_Biop_Int1',
    'D326_Biop_Pro1', 'D326_Biop_Int1', 'D326_Brus_Dis1',
    'D337_Brus_Dis1',
    'D339_Biop_Nas1', 'D339_Biop_Pro1', 'D339_Biop_Int1', 'D339_Brus_Dis1',
    'D344_Biop_Nas1', 'D344_Biop_Pro1', 'D344_Biop_Int1', 'D344_Brus_Dis1',
    'D345_Biop_Nas1',
    'D353_Brus_Nas1', 'D353_Biop_Pro1', 'D353_Biop_Int2', 'D353_Brus_Dis1',
    'D354_Biop_Pro1', 'D354_Biop_Int2', 'D354_Brus_Dis1',
    'D363_Brus_Nas1', 'D363_Biop_Pro1', 'D363_Biop_Int2', 'D363_Brus_Dis1',
    'D367_Brus_Nas1', 'D367_Biop_Pro1', 'D367_Biop_Int1', 'D367_Brus_Dis1',
    'D372_Brus_Nas1', 'D372_Biop_Pro1', 'D372_Biop_Int1', 'D372_Biop_Int2',
    'D372_Brus_Dis1',
]
df_ = pd.DataFrame(index=index, columns=columns).fillna(0)
def _preprocess_sample(sample, position, method, donor, max_counts, max_mito,
                       min_genes=500, empty_min_counts=5, empty_max_counts=50):
    """Load one sample, estimate its background, then QC-filter and normalise it.

    The same sequence of steps was previously copy-pasted once per sample;
    only the arguments below differed between the copies.

    Parameters
    ----------
    sample : str
        Sample folder name (e.g. 'D322_Biop_Nas1'); also stored in
        ``obs['manip']`` and used as prefix of every barcode name.
    position, method, donor : str
        Annotations written to ``obs['position']``, ``obs['method']`` and
        ``obs['donor']``.
    max_counts : int
        Cells are kept only if ``n_counts < max_counts``.
    max_mito : float
        Cells are kept only if ``percent_mito < max_mito`` (a fraction).
    min_genes : int
        Minimum number of detected genes passed to ``sc.pp.filter_cells``.
    empty_min_counts, empty_max_counts : int
        Barcodes with ``empty_min_counts < n_counts <= empty_max_counts`` are
        treated as empty droplets and used to compute the background.

    Returns
    -------
    adata : AnnData
        Normalised, log-transformed cells restricted to the genes flagged
        True in ``var['ribo_genes']`` (i.e. genes NOT in the RB gene list).
    adata_raw : AnnData
        The same cells before that gene restriction. Its ``.raw`` attribute
        was set after normalisation and log-transformation, so it holds the
        transformed values for all genes.
    """
    # --- load the complete (raw) matrix and annotate the barcodes ----------
    adata = sc.read_10x_mtx(
        './' + sample + '/' + outsPath,
        var_names='gene_symbols',
        cache=True)
    adata.var_names_make_unique()
    adata.obs['manip'] = sample
    adata.obs['position'] = position
    adata.obs['method'] = method
    adata.obs['donor'] = donor
    adata.obs['name'] = [sample + '_' + s for s in list(adata.obs.index)]
    adata.obs_names = adata.obs['name']

    # --- smallest total count among the barcodes of the filtered matrix ----
    filtered = sc.read_10x_mtx(
        './' + sample + '/' + outsPathFilter,
        var_names='gene_symbols',
        cache=True)
    minCount = min(filtered.X.sum(axis=1).A1)

    adata.obs['n_counts'] = adata.X.sum(axis=1).A1
    # min_genes=0 removes nothing; the call is made to obtain obs['n_genes'].
    sc.pp.filter_cells(adata, min_genes=0)

    # --- rank barcodes by total count and split them into cells / empties --
    qc_df = adata.obs[['n_genes', 'n_counts']]
    qc_df = qc_df.sort_values(by='n_counts', ascending=False)
    qc_df['cell_nb'] = range(0, qc_df.shape[0])
    qc_df = qc_df[qc_df['n_counts'] > 0]
    # NOTE(review): the strict '>' drops barcodes whose total equals minCount,
    # i.e. the smallest barcode(s) of the filtered matrix -- confirm intended.
    qc_df_subsetFull = qc_df[qc_df['n_counts'] > minCount]
    print(qc_df_subsetFull.shape)
    qc_df_subsetEmpty = qc_df[qc_df['n_counts'] <= empty_max_counts]
    qc_df_subsetEmpty = qc_df_subsetEmpty[
        qc_df_subsetEmpty['n_counts'] > empty_min_counts]
    print(qc_df_subsetEmpty.shape)

    # --- barcode-rank plot with the thresholds actually applied ------------
    _fig, ax = plt.subplots()
    ax.plot(qc_df[['cell_nb']], qc_df[['n_counts']])
    ax.set_xscale('log')
    ax.set_yscale('log')
    # Red lines mark the empty-droplet window used above (previously most
    # samples drew a single line at y=10 that matched neither bound).
    plt.axhline(y=empty_max_counts, color='r')  # Limit to empty droplets
    plt.axhline(y=empty_min_counts, color='r')  # Limit to empty droplets
    plt.axhline(y=minCount, color='k')  # Limit to filtered dataset
    plt.show()

    # --- background estimated from the empty droplets ----------------------
    empty = adata[qc_df_subsetEmpty.index.tolist()]
    # Despite its name, 'n_cells' holds the summed counts of each gene over
    # the empty droplets.
    empty.var['n_cells'] = empty.X.sum(axis=0).A1
    # Background score for the sample: mean total count per empty droplet.
    back = sum(empty.var['n_cells']) / empty.shape[0]

    # --- keep the barcodes above the filtered-matrix threshold -------------
    adata = adata[qc_df_subsetFull.index.tolist()]
    adata.obs['background'] = back
    adata.obs['empty_droplets'] = empty.shape[0]
    adata.var['background_gene'] = empty.var['n_cells']
    sc.pp.filter_cells(adata, min_genes=0)

    # --- per-cell QC metrics ------------------------------------------------
    mito_genes = adata.var_names.str.startswith('MT-')
    adata.obs['percent_mito'] = np.sum(
        adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
    adata.obs['n_counts'] = adata.X.sum(axis=1).A1
    # Only the list of RB genes present in this sample is needed here.
    expression_df = adata.to_df()
    _, _, RB_genes_in_df = remove_RB_genes(expression_df)
    ribo_genes = expression_df.columns.isin(RB_genes_in_df)
    adata.obs['percent_ribo'] = np.sum(
        adata[:, ribo_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
    # Careful: var['ribo_genes'] is True for genes that are NOT RB genes; it
    # is the mask used at the end to discard the RB genes.
    adata.var['ribo_genes'] = [not i for i in ribo_genes]
    sc.pl.violin(adata, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
                 jitter=0.4, multi_panel=True)

    # --- cell filtering, normalisation, log transform ----------------------
    sc.pp.filter_cells(adata, min_genes=min_genes)
    adata = adata[adata.obs['n_counts'] < max_counts, :]
    adata = adata[adata.obs['percent_mito'] < max_mito, :]
    sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)  # Normalize all data
    sc.pp.log1p(adata)  # log transform the data
    adata.raw = adata
    # Keep a handle on the all-genes object before the RB genes are removed.
    adata_raw = adata
    adata = adata[:, adata.var['ribo_genes']]
    return adata, adata_raw


# One call per sample; the last two positional arguments are the sample-specific
# upper bounds on n_counts and percent_mito.
# The intermediate objects (qc_df, empty, back, minCount, ...) are now local to
# the helper and no longer left behind as module-level variables.

# Nasal biopsies
D322_Biop_Nas1, D322_Biop_Nas1_raw = _preprocess_sample(
    'D322_Biop_Nas1', 'Nasal', 'Biopsy', 'D322', 40000, 0.2)
D339_Biop_Nas1, D339_Biop_Nas1_raw = _preprocess_sample(
    'D339_Biop_Nas1', 'Nasal', 'Biopsy', 'D339', 40000, 0.15)
D344_Biop_Nas1, D344_Biop_Nas1_raw = _preprocess_sample(
    'D344_Biop_Nas1', 'Nasal', 'Biopsy', 'D344', 50000, 0.1)
D345_Biop_Nas1, D345_Biop_Nas1_raw = _preprocess_sample(
    'D345_Biop_Nas1', 'Nasal', 'Biopsy', 'D345', 20000, 0.2)

# Nasal brushings
D353_Brus_Nas1, D353_Brus_Nas1_raw = _preprocess_sample(
    'D353_Brus_Nas1', 'Nasal', 'Brushing', 'D353', 40000, 0.5)
D363_Brus_Nas1, D363_Brus_Nas1_raw = _preprocess_sample(
    'D363_Brus_Nas1', 'Nasal', 'Brushing', 'D363', 30000, 0.5)
D367_Brus_Nas1, D367_Brus_Nas1_raw = _preprocess_sample(
    'D367_Brus_Nas1', 'Nasal', 'Brushing', 'D367', 30000, 0.5)
D372_Brus_Nas1, D372_Brus_Nas1_raw = _preprocess_sample(
    'D372_Brus_Nas1', 'Nasal', 'Brushing', 'D372', 40000, 0.5)

# Proximal biopsies
D322_Biop_Pro1, D322_Biop_Pro1_raw = _preprocess_sample(
    'D322_Biop_Pro1', 'Proximal', 'Biopsy', 'D322', 20000, 0.3)
# Sample D326_Biop_Pro1 (donor D326, biopsy, proximal position).
# Load the raw Cell Ranger matrix (every droplet), score the background from
# low-count droplets, keep the droplets above the smallest total found in the
# Cell Ranger-filtered matrix, then QC-filter, normalise and log-transform.
D326_Biop_Pro1 = sc.read_10x_mtx(
    './D326_Biop_Pro1/' + outsPath,
    var_names='gene_symbols',
    cache=True)
D326_Biop_Pro1.var_names_make_unique()
D326_Biop_Pro1.obs['manip'] = 'D326_Biop_Pro1'
D326_Biop_Pro1.obs['position'] = 'Proximal'
D326_Biop_Pro1.obs['method'] = 'Biopsy'
D326_Biop_Pro1.obs['donor'] = 'D326'
# Prefix every barcode with the sample name and use the result as obs_names.
D326_Biop_Pro1.obs['name'] = ['D326_Biop_Pro1_' + s for s in D326_Biop_Pro1.obs_names]
D326_Biop_Pro1.obs_names = D326_Biop_Pro1.obs['name']
D326_Biop_Pro1
# The filtered matrix is read only to get the smallest per-barcode total in it.
filtered = sc.read_10x_mtx(
    './D326_Biop_Pro1/' + outsPathFilter,
    var_names='gene_symbols',
    cache=True)
minCount = filtered.X.sum(axis=1).A1.min()
D326_Biop_Pro1.obs['n_counts'] = D326_Biop_Pro1.X.sum(axis=1).A1
# min_genes=0 keeps every droplet; the call is what fills obs['n_genes'].
sc.pp.filter_cells(D326_Biop_Pro1, min_genes=0)
# Rank droplets by total counts (descending) for the barcode-rank plot.
qc_df = D326_Biop_Pro1.obs[['n_genes', 'n_counts']].sort_values(by='n_counts', ascending=False)
qc_df['cell_nb'] = range(0, qc_df.shape[0])
qc_df = qc_df[qc_df['n_counts'] > 0]
qc_df.shape
# NOTE(review): the strict '>' also drops droplets whose total equals minCount,
# i.e. the smallest barcode(s) of the filtered matrix - confirm '>=' is not wanted.
qc_df_subsetFull = qc_df[qc_df['n_counts'] > minCount]
print(qc_df_subsetFull.shape)
# Droplets with 5 < n_counts <= 50 are the ones used as "empty" droplets.
qc_df_subsetEmpty = qc_df[(qc_df['n_counts'] > 5) & (qc_df['n_counts'] <= 50)]
print(qc_df_subsetEmpty.shape)
fig, ax = plt.subplots()
ax.plot(qc_df[['cell_nb']], qc_df[['n_counts']])
ax.set_xscale('log'); ax.set_yscale('log')
plt.axhline(y=10, color='r') # Visual guide for empty droplets (the selection above uses 5 < n_counts <= 50)
plt.axhline(y=minCount, color='k') # Limit to filtered dataset
plt.show()
# Take explicit copies: new columns are added to both subsets below, and
# writing to an AnnData view would otherwise trigger an implicit copy.
empty = D326_Biop_Pro1[qc_df_subsetEmpty.index.tolist()].copy()
empty.var['n_cells'] = empty.X.sum(axis=0).A1 # despite the name: summed counts per gene over the empty droplets
back = sum(empty.var['n_cells'])/empty.shape[0] # Background score for the sample (mean total counts per empty droplet)
#empty.var.sort_values(by = 'n_cells', ascending = False).iloc[0:50].index.tolist()
D326_Biop_Pro1 = D326_Biop_Pro1[qc_df_subsetFull.index.tolist()].copy()
D326_Biop_Pro1.obs['background'] = back
D326_Biop_Pro1.obs['empty_droplets'] = empty.shape[0]
D326_Biop_Pro1.var['background_gene'] = empty.var['n_cells']
sc.pp.filter_cells(D326_Biop_Pro1, min_genes=0)
mito_genes = D326_Biop_Pro1.var_names.str.startswith('MT-')
# Compute the per-cell total once and reuse it as the denominator of both fractions.
D326_Biop_Pro1.obs['n_counts'] = D326_Biop_Pro1.X.sum(axis=1).A1
D326_Biop_Pro1.obs['percent_mito'] = (
    D326_Biop_Pro1[:, mito_genes].X.sum(axis=1).A1 / D326_Biop_Pro1.obs['n_counts'].values)
# Only the returned gene list is used in this section; the mask is built from
# var_names directly rather than from a second dense to_df() conversion.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D326_Biop_Pro1.to_df())
ribo_genes = D326_Biop_Pro1.var_names.isin(RB_genes_in_df)
D326_Biop_Pro1.obs['percent_ribo'] = (
    D326_Biop_Pro1[:, ribo_genes].X.sum(axis=1).A1 / D326_Biop_Pro1.obs['n_counts'].values)
# NB: var['ribo_genes'] is True for the genes that are NOT in the RB list (genes to keep).
D326_Biop_Pro1.var['ribo_genes'] = ~ribo_genes
sc.pl.violin(D326_Biop_Pro1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
# Sample-specific QC cut-offs.
sc.pp.filter_cells(D326_Biop_Pro1, min_genes=500)
D326_Biop_Pro1 = D326_Biop_Pro1[D326_Biop_Pro1.obs['n_counts'] < 40000, :]
D326_Biop_Pro1 = D326_Biop_Pro1[D326_Biop_Pro1.obs['percent_mito'] < 0.5, :].copy() # copy: the normalisation below modifies the object in place
sc.pp.normalize_per_cell(D326_Biop_Pro1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D326_Biop_Pro1) # log transform the data
D326_Biop_Pro1.raw = D326_Biop_Pro1
# D326_Biop_Pro1_raw keeps a reference to the all-genes object (no copy is made);
# the next line rebinds D326_Biop_Pro1 to the genes flagged in var['ribo_genes'].
D326_Biop_Pro1_raw = D326_Biop_Pro1
D326_Biop_Pro1 = D326_Biop_Pro1[:, D326_Biop_Pro1.var['ribo_genes']]
D326_Biop_Pro1
# Sample D339_Biop_Pro1 (donor D339, biopsy, proximal position).
# Load the raw Cell Ranger matrix (every droplet), score the background from
# low-count droplets, keep the droplets above the smallest total found in the
# Cell Ranger-filtered matrix, then QC-filter, normalise and log-transform.
D339_Biop_Pro1 = sc.read_10x_mtx(
    './D339_Biop_Pro1/' + outsPath,
    var_names='gene_symbols',
    cache=True)
D339_Biop_Pro1.var_names_make_unique()
D339_Biop_Pro1.obs['manip'] = 'D339_Biop_Pro1'
D339_Biop_Pro1.obs['position'] = 'Proximal'
D339_Biop_Pro1.obs['method'] = 'Biopsy'
D339_Biop_Pro1.obs['donor'] = 'D339'
# Prefix every barcode with the sample name and use the result as obs_names.
D339_Biop_Pro1.obs['name'] = ['D339_Biop_Pro1_' + s for s in D339_Biop_Pro1.obs_names]
D339_Biop_Pro1.obs_names = D339_Biop_Pro1.obs['name']
D339_Biop_Pro1
# The filtered matrix is read only to get the smallest per-barcode total in it.
filtered = sc.read_10x_mtx(
    './D339_Biop_Pro1/' + outsPathFilter,
    var_names='gene_symbols',
    cache=True)
minCount = filtered.X.sum(axis=1).A1.min()
D339_Biop_Pro1.obs['n_counts'] = D339_Biop_Pro1.X.sum(axis=1).A1
# min_genes=0 keeps every droplet; the call is what fills obs['n_genes'].
sc.pp.filter_cells(D339_Biop_Pro1, min_genes=0)
# Rank droplets by total counts (descending) for the barcode-rank plot.
qc_df = D339_Biop_Pro1.obs[['n_genes', 'n_counts']].sort_values(by='n_counts', ascending=False)
qc_df['cell_nb'] = range(0, qc_df.shape[0])
qc_df = qc_df[qc_df['n_counts'] > 0]
qc_df.shape
# NOTE(review): the strict '>' also drops droplets whose total equals minCount,
# i.e. the smallest barcode(s) of the filtered matrix - confirm '>=' is not wanted.
qc_df_subsetFull = qc_df[qc_df['n_counts'] > minCount]
print(qc_df_subsetFull.shape)
# Droplets with 5 < n_counts <= 50 are the ones used as "empty" droplets.
qc_df_subsetEmpty = qc_df[(qc_df['n_counts'] > 5) & (qc_df['n_counts'] <= 50)]
print(qc_df_subsetEmpty.shape)
fig, ax = plt.subplots()
ax.plot(qc_df[['cell_nb']], qc_df[['n_counts']])
ax.set_xscale('log'); ax.set_yscale('log')
plt.axhline(y=10, color='r') # Visual guide for empty droplets (the selection above uses 5 < n_counts <= 50)
plt.axhline(y=minCount, color='k') # Limit to filtered dataset
plt.show()
# Take explicit copies: new columns are added to both subsets below, and
# writing to an AnnData view would otherwise trigger an implicit copy.
empty = D339_Biop_Pro1[qc_df_subsetEmpty.index.tolist()].copy()
empty.var['n_cells'] = empty.X.sum(axis=0).A1 # despite the name: summed counts per gene over the empty droplets
back = sum(empty.var['n_cells'])/empty.shape[0] # Background score for the sample (mean total counts per empty droplet)
#empty.var.sort_values(by = 'n_cells', ascending = False).iloc[0:50].index.tolist()
D339_Biop_Pro1 = D339_Biop_Pro1[qc_df_subsetFull.index.tolist()].copy()
D339_Biop_Pro1.obs['background'] = back
D339_Biop_Pro1.obs['empty_droplets'] = empty.shape[0]
D339_Biop_Pro1.var['background_gene'] = empty.var['n_cells']
sc.pp.filter_cells(D339_Biop_Pro1, min_genes=0)
mito_genes = D339_Biop_Pro1.var_names.str.startswith('MT-')
# Compute the per-cell total once and reuse it as the denominator of both fractions.
D339_Biop_Pro1.obs['n_counts'] = D339_Biop_Pro1.X.sum(axis=1).A1
D339_Biop_Pro1.obs['percent_mito'] = (
    D339_Biop_Pro1[:, mito_genes].X.sum(axis=1).A1 / D339_Biop_Pro1.obs['n_counts'].values)
# Only the returned gene list is used in this section; the mask is built from
# var_names directly rather than from a second dense to_df() conversion.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D339_Biop_Pro1.to_df())
ribo_genes = D339_Biop_Pro1.var_names.isin(RB_genes_in_df)
D339_Biop_Pro1.obs['percent_ribo'] = (
    D339_Biop_Pro1[:, ribo_genes].X.sum(axis=1).A1 / D339_Biop_Pro1.obs['n_counts'].values)
# NB: var['ribo_genes'] is True for the genes that are NOT in the RB list (genes to keep).
D339_Biop_Pro1.var['ribo_genes'] = ~ribo_genes
sc.pl.violin(D339_Biop_Pro1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
# Sample-specific QC cut-offs.
sc.pp.filter_cells(D339_Biop_Pro1, min_genes=500)
D339_Biop_Pro1 = D339_Biop_Pro1[D339_Biop_Pro1.obs['n_counts'] < 40000, :]
D339_Biop_Pro1 = D339_Biop_Pro1[D339_Biop_Pro1.obs['percent_mito'] < 0.2, :].copy() # copy: the normalisation below modifies the object in place
sc.pp.normalize_per_cell(D339_Biop_Pro1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D339_Biop_Pro1) # log transform the data
D339_Biop_Pro1.raw = D339_Biop_Pro1
# D339_Biop_Pro1_raw keeps a reference to the all-genes object (no copy is made);
# the next line rebinds D339_Biop_Pro1 to the genes flagged in var['ribo_genes'].
D339_Biop_Pro1_raw = D339_Biop_Pro1
D339_Biop_Pro1 = D339_Biop_Pro1[:, D339_Biop_Pro1.var['ribo_genes']]
D339_Biop_Pro1
# Sample D344_Biop_Pro1 (donor D344, biopsy, proximal position).
# Load the raw Cell Ranger matrix (every droplet), score the background from
# low-count droplets, keep the droplets above the smallest total found in the
# Cell Ranger-filtered matrix, then QC-filter, normalise and log-transform.
D344_Biop_Pro1 = sc.read_10x_mtx(
    './D344_Biop_Pro1/' + outsPath,
    var_names='gene_symbols',
    cache=True)
D344_Biop_Pro1.var_names_make_unique()
D344_Biop_Pro1.obs['manip'] = 'D344_Biop_Pro1'
D344_Biop_Pro1.obs['position'] = 'Proximal'
D344_Biop_Pro1.obs['method'] = 'Biopsy'
D344_Biop_Pro1.obs['donor'] = 'D344'
# Prefix every barcode with the sample name and use the result as obs_names.
D344_Biop_Pro1.obs['name'] = ['D344_Biop_Pro1_' + s for s in D344_Biop_Pro1.obs_names]
D344_Biop_Pro1.obs_names = D344_Biop_Pro1.obs['name']
D344_Biop_Pro1
# The filtered matrix is read only to get the smallest per-barcode total in it.
filtered = sc.read_10x_mtx(
    './D344_Biop_Pro1/' + outsPathFilter,
    var_names='gene_symbols',
    cache=True)
minCount = filtered.X.sum(axis=1).A1.min()
D344_Biop_Pro1.obs['n_counts'] = D344_Biop_Pro1.X.sum(axis=1).A1
# min_genes=0 keeps every droplet; the call is what fills obs['n_genes'].
sc.pp.filter_cells(D344_Biop_Pro1, min_genes=0)
# Rank droplets by total counts (descending) for the barcode-rank plot.
qc_df = D344_Biop_Pro1.obs[['n_genes', 'n_counts']].sort_values(by='n_counts', ascending=False)
qc_df['cell_nb'] = range(0, qc_df.shape[0])
qc_df = qc_df[qc_df['n_counts'] > 0]
qc_df.shape
# NOTE(review): the strict '>' also drops droplets whose total equals minCount,
# i.e. the smallest barcode(s) of the filtered matrix - confirm '>=' is not wanted.
qc_df_subsetFull = qc_df[qc_df['n_counts'] > minCount]
print(qc_df_subsetFull.shape)
# Droplets with 5 < n_counts <= 50 are the ones used as "empty" droplets.
qc_df_subsetEmpty = qc_df[(qc_df['n_counts'] > 5) & (qc_df['n_counts'] <= 50)]
print(qc_df_subsetEmpty.shape)
fig, ax = plt.subplots()
ax.plot(qc_df[['cell_nb']], qc_df[['n_counts']])
ax.set_xscale('log'); ax.set_yscale('log')
plt.axhline(y=10, color='r') # Visual guide for empty droplets (the selection above uses 5 < n_counts <= 50)
plt.axhline(y=minCount, color='k') # Limit to filtered dataset
plt.show()
# Take explicit copies: new columns are added to both subsets below, and
# writing to an AnnData view would otherwise trigger an implicit copy.
empty = D344_Biop_Pro1[qc_df_subsetEmpty.index.tolist()].copy()
empty.var['n_cells'] = empty.X.sum(axis=0).A1 # despite the name: summed counts per gene over the empty droplets
back = sum(empty.var['n_cells'])/empty.shape[0] # Background score for the sample (mean total counts per empty droplet)
#empty.var.sort_values(by = 'n_cells', ascending = False).iloc[0:50].index.tolist()
D344_Biop_Pro1 = D344_Biop_Pro1[qc_df_subsetFull.index.tolist()].copy()
D344_Biop_Pro1.obs['background'] = back
D344_Biop_Pro1.obs['empty_droplets'] = empty.shape[0]
D344_Biop_Pro1.var['background_gene'] = empty.var['n_cells']
sc.pp.filter_cells(D344_Biop_Pro1, min_genes=0)
mito_genes = D344_Biop_Pro1.var_names.str.startswith('MT-')
# Compute the per-cell total once and reuse it as the denominator of both fractions.
D344_Biop_Pro1.obs['n_counts'] = D344_Biop_Pro1.X.sum(axis=1).A1
D344_Biop_Pro1.obs['percent_mito'] = (
    D344_Biop_Pro1[:, mito_genes].X.sum(axis=1).A1 / D344_Biop_Pro1.obs['n_counts'].values)
# Only the returned gene list is used in this section; the mask is built from
# var_names directly rather than from a second dense to_df() conversion.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D344_Biop_Pro1.to_df())
ribo_genes = D344_Biop_Pro1.var_names.isin(RB_genes_in_df)
D344_Biop_Pro1.obs['percent_ribo'] = (
    D344_Biop_Pro1[:, ribo_genes].X.sum(axis=1).A1 / D344_Biop_Pro1.obs['n_counts'].values)
# NB: var['ribo_genes'] is True for the genes that are NOT in the RB list (genes to keep).
D344_Biop_Pro1.var['ribo_genes'] = ~ribo_genes
sc.pl.violin(D344_Biop_Pro1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
# Sample-specific QC cut-offs.
sc.pp.filter_cells(D344_Biop_Pro1, min_genes=500)
D344_Biop_Pro1 = D344_Biop_Pro1[D344_Biop_Pro1.obs['n_counts'] < 40000, :]
D344_Biop_Pro1 = D344_Biop_Pro1[D344_Biop_Pro1.obs['percent_mito'] < 0.15, :].copy() # copy: the normalisation below modifies the object in place
sc.pp.normalize_per_cell(D344_Biop_Pro1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D344_Biop_Pro1) # log transform the data
D344_Biop_Pro1.raw = D344_Biop_Pro1
# D344_Biop_Pro1_raw keeps a reference to the all-genes object (no copy is made);
# the next line rebinds D344_Biop_Pro1 to the genes flagged in var['ribo_genes'].
D344_Biop_Pro1_raw = D344_Biop_Pro1
D344_Biop_Pro1 = D344_Biop_Pro1[:, D344_Biop_Pro1.var['ribo_genes']]
D344_Biop_Pro1
# Sample D353_Biop_Pro1 (donor D353, biopsy, proximal position).
# Load the raw Cell Ranger matrix (every droplet), score the background from
# low-count droplets, keep the droplets above the smallest total found in the
# Cell Ranger-filtered matrix, then QC-filter, normalise and log-transform.
D353_Biop_Pro1 = sc.read_10x_mtx(
    './D353_Biop_Pro1/' + outsPath,
    var_names='gene_symbols',
    cache=True)
D353_Biop_Pro1.var_names_make_unique()
D353_Biop_Pro1.obs['manip'] = 'D353_Biop_Pro1'
D353_Biop_Pro1.obs['position'] = 'Proximal'
D353_Biop_Pro1.obs['method'] = 'Biopsy'
D353_Biop_Pro1.obs['donor'] = 'D353'
# Prefix every barcode with the sample name and use the result as obs_names.
# The prefix ends with '_' so names follow the '<sample>_<barcode>' pattern
# used for the other samples.
D353_Biop_Pro1.obs['name'] = ['D353_Biop_Pro1_' + s for s in D353_Biop_Pro1.obs_names]
D353_Biop_Pro1.obs_names = D353_Biop_Pro1.obs['name']
D353_Biop_Pro1
# The filtered matrix is read only to get the smallest per-barcode total in it.
filtered = sc.read_10x_mtx(
    './D353_Biop_Pro1/' + outsPathFilter,
    var_names='gene_symbols',
    cache=True)
minCount = filtered.X.sum(axis=1).A1.min()
D353_Biop_Pro1.obs['n_counts'] = D353_Biop_Pro1.X.sum(axis=1).A1
# min_genes=0 keeps every droplet; the call is what fills obs['n_genes'].
sc.pp.filter_cells(D353_Biop_Pro1, min_genes=0)
# Rank droplets by total counts (descending) for the barcode-rank plot.
qc_df = D353_Biop_Pro1.obs[['n_genes', 'n_counts']].sort_values(by='n_counts', ascending=False)
qc_df['cell_nb'] = range(0, qc_df.shape[0])
qc_df = qc_df[qc_df['n_counts'] > 0]
qc_df.shape
# NOTE(review): the strict '>' also drops droplets whose total equals minCount,
# i.e. the smallest barcode(s) of the filtered matrix - confirm '>=' is not wanted.
qc_df_subsetFull = qc_df[qc_df['n_counts'] > minCount]
print(qc_df_subsetFull.shape)
# Droplets with 5 < n_counts <= 50 are the ones used as "empty" droplets.
qc_df_subsetEmpty = qc_df[(qc_df['n_counts'] > 5) & (qc_df['n_counts'] <= 50)]
print(qc_df_subsetEmpty.shape)
fig, ax = plt.subplots()
ax.plot(qc_df[['cell_nb']], qc_df[['n_counts']])
ax.set_xscale('log'); ax.set_yscale('log')
plt.axhline(y=10, color='r') # Visual guide for empty droplets (the selection above uses 5 < n_counts <= 50)
plt.axhline(y=minCount, color='k') # Limit to filtered dataset
plt.show()
# Take explicit copies: new columns are added to both subsets below, and
# writing to an AnnData view would otherwise trigger an implicit copy.
empty = D353_Biop_Pro1[qc_df_subsetEmpty.index.tolist()].copy()
empty.var['n_cells'] = empty.X.sum(axis=0).A1 # despite the name: summed counts per gene over the empty droplets
back = sum(empty.var['n_cells'])/empty.shape[0] # Background score for the sample (mean total counts per empty droplet)
#empty.var.sort_values(by = 'n_cells', ascending = False).iloc[0:50].index.tolist()
D353_Biop_Pro1 = D353_Biop_Pro1[qc_df_subsetFull.index.tolist()].copy()
D353_Biop_Pro1.obs['background'] = back
D353_Biop_Pro1.obs['empty_droplets'] = empty.shape[0]
D353_Biop_Pro1.var['background_gene'] = empty.var['n_cells']
sc.pp.filter_cells(D353_Biop_Pro1, min_genes=0)
mito_genes = D353_Biop_Pro1.var_names.str.startswith('MT-')
# Compute the per-cell total once and reuse it as the denominator of both fractions.
D353_Biop_Pro1.obs['n_counts'] = D353_Biop_Pro1.X.sum(axis=1).A1
D353_Biop_Pro1.obs['percent_mito'] = (
    D353_Biop_Pro1[:, mito_genes].X.sum(axis=1).A1 / D353_Biop_Pro1.obs['n_counts'].values)
# Only the returned gene list is used in this section; the mask is built from
# var_names directly rather than from a second dense to_df() conversion.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D353_Biop_Pro1.to_df())
ribo_genes = D353_Biop_Pro1.var_names.isin(RB_genes_in_df)
D353_Biop_Pro1.obs['percent_ribo'] = (
    D353_Biop_Pro1[:, ribo_genes].X.sum(axis=1).A1 / D353_Biop_Pro1.obs['n_counts'].values)
# NB: var['ribo_genes'] is True for the genes that are NOT in the RB list (genes to keep).
D353_Biop_Pro1.var['ribo_genes'] = ~ribo_genes
sc.pl.violin(D353_Biop_Pro1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
# Sample-specific QC cut-offs.
sc.pp.filter_cells(D353_Biop_Pro1, min_genes=500)
D353_Biop_Pro1 = D353_Biop_Pro1[D353_Biop_Pro1.obs['n_counts'] < 15000, :]
D353_Biop_Pro1 = D353_Biop_Pro1[D353_Biop_Pro1.obs['percent_mito'] < 0.25, :].copy() # copy: the normalisation below modifies the object in place
sc.pp.normalize_per_cell(D353_Biop_Pro1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D353_Biop_Pro1) # log transform the data
D353_Biop_Pro1.raw = D353_Biop_Pro1
# D353_Biop_Pro1_raw keeps a reference to the all-genes object (no copy is made);
# the next line rebinds D353_Biop_Pro1 to the genes flagged in var['ribo_genes'].
D353_Biop_Pro1_raw = D353_Biop_Pro1
D353_Biop_Pro1 = D353_Biop_Pro1[:, D353_Biop_Pro1.var['ribo_genes']]
D353_Biop_Pro1
# Sample D354_Biop_Pro1 (donor D354, biopsy, proximal position).
# Load the raw Cell Ranger matrix (every droplet), score the background from
# low-count droplets, keep the droplets above the smallest total found in the
# Cell Ranger-filtered matrix, then QC-filter, normalise and log-transform.
D354_Biop_Pro1 = sc.read_10x_mtx(
    './D354_Biop_Pro1/' + outsPath,
    var_names='gene_symbols',
    cache=True)
D354_Biop_Pro1.var_names_make_unique()
D354_Biop_Pro1.obs['manip'] = 'D354_Biop_Pro1'
D354_Biop_Pro1.obs['position'] = 'Proximal'
D354_Biop_Pro1.obs['method'] = 'Biopsy'
D354_Biop_Pro1.obs['donor'] = 'D354'
# Prefix every barcode with the sample name and use the result as obs_names.
D354_Biop_Pro1.obs['name'] = ['D354_Biop_Pro1_' + s for s in D354_Biop_Pro1.obs_names]
D354_Biop_Pro1.obs_names = D354_Biop_Pro1.obs['name']
D354_Biop_Pro1
# The filtered matrix is read only to get the smallest per-barcode total in it.
filtered = sc.read_10x_mtx(
    './D354_Biop_Pro1/' + outsPathFilter,
    var_names='gene_symbols',
    cache=True)
minCount = filtered.X.sum(axis=1).A1.min()
D354_Biop_Pro1.obs['n_counts'] = D354_Biop_Pro1.X.sum(axis=1).A1
# min_genes=0 keeps every droplet; the call is what fills obs['n_genes'].
sc.pp.filter_cells(D354_Biop_Pro1, min_genes=0)
# Rank droplets by total counts (descending) for the barcode-rank plot.
qc_df = D354_Biop_Pro1.obs[['n_genes', 'n_counts']].sort_values(by='n_counts', ascending=False)
qc_df['cell_nb'] = range(0, qc_df.shape[0])
qc_df = qc_df[qc_df['n_counts'] > 0]
qc_df.shape
# NOTE(review): the strict '>' also drops droplets whose total equals minCount,
# i.e. the smallest barcode(s) of the filtered matrix - confirm '>=' is not wanted.
qc_df_subsetFull = qc_df[qc_df['n_counts'] > minCount]
print(qc_df_subsetFull.shape)
# Droplets with 5 < n_counts <= 50 are the ones used as "empty" droplets.
qc_df_subsetEmpty = qc_df[(qc_df['n_counts'] > 5) & (qc_df['n_counts'] <= 50)]
print(qc_df_subsetEmpty.shape)
fig, ax = plt.subplots()
ax.plot(qc_df[['cell_nb']], qc_df[['n_counts']])
ax.set_xscale('log'); ax.set_yscale('log')
plt.axhline(y=10, color='r') # Visual guide for empty droplets (the selection above uses 5 < n_counts <= 50)
plt.axhline(y=minCount, color='k') # Limit to filtered dataset
plt.show()
# Take explicit copies: new columns are added to both subsets below, and
# writing to an AnnData view would otherwise trigger an implicit copy.
empty = D354_Biop_Pro1[qc_df_subsetEmpty.index.tolist()].copy()
empty.var['n_cells'] = empty.X.sum(axis=0).A1 # despite the name: summed counts per gene over the empty droplets
back = sum(empty.var['n_cells'])/empty.shape[0] # Background score for the sample (mean total counts per empty droplet)
#empty.var.sort_values(by = 'n_cells', ascending = False).iloc[0:50].index.tolist()
D354_Biop_Pro1 = D354_Biop_Pro1[qc_df_subsetFull.index.tolist()].copy()
D354_Biop_Pro1.obs['background'] = back
D354_Biop_Pro1.obs['empty_droplets'] = empty.shape[0]
D354_Biop_Pro1.var['background_gene'] = empty.var['n_cells']
sc.pp.filter_cells(D354_Biop_Pro1, min_genes=0)
mito_genes = D354_Biop_Pro1.var_names.str.startswith('MT-')
# Compute the per-cell total once and reuse it as the denominator of both fractions.
D354_Biop_Pro1.obs['n_counts'] = D354_Biop_Pro1.X.sum(axis=1).A1
D354_Biop_Pro1.obs['percent_mito'] = (
    D354_Biop_Pro1[:, mito_genes].X.sum(axis=1).A1 / D354_Biop_Pro1.obs['n_counts'].values)
# Only the returned gene list is used in this section; the mask is built from
# var_names directly rather than from a second dense to_df() conversion.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D354_Biop_Pro1.to_df())
ribo_genes = D354_Biop_Pro1.var_names.isin(RB_genes_in_df)
D354_Biop_Pro1.obs['percent_ribo'] = (
    D354_Biop_Pro1[:, ribo_genes].X.sum(axis=1).A1 / D354_Biop_Pro1.obs['n_counts'].values)
# NB: var['ribo_genes'] is True for the genes that are NOT in the RB list (genes to keep).
D354_Biop_Pro1.var['ribo_genes'] = ~ribo_genes
sc.pl.violin(D354_Biop_Pro1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
# Sample-specific QC cut-offs.
sc.pp.filter_cells(D354_Biop_Pro1, min_genes=500)
D354_Biop_Pro1 = D354_Biop_Pro1[D354_Biop_Pro1.obs['n_counts'] < 30000, :]
D354_Biop_Pro1 = D354_Biop_Pro1[D354_Biop_Pro1.obs['percent_mito'] < 0.15, :].copy() # copy: the normalisation below modifies the object in place
sc.pp.normalize_per_cell(D354_Biop_Pro1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D354_Biop_Pro1) # log transform the data
D354_Biop_Pro1.raw = D354_Biop_Pro1
# D354_Biop_Pro1_raw keeps a reference to the all-genes object (no copy is made);
# the next line rebinds D354_Biop_Pro1 to the genes flagged in var['ribo_genes'].
D354_Biop_Pro1_raw = D354_Biop_Pro1
D354_Biop_Pro1 = D354_Biop_Pro1[:, D354_Biop_Pro1.var['ribo_genes']]
D354_Biop_Pro1
# Sample D363_Biop_Pro1 (donor D363, biopsy, proximal position).
# Load the raw Cell Ranger matrix (every droplet), score the background from
# low-count droplets, keep the droplets above the smallest total found in the
# Cell Ranger-filtered matrix, then QC-filter, normalise and log-transform.
D363_Biop_Pro1 = sc.read_10x_mtx(
    './D363_Biop_Pro1/' + outsPath,
    var_names='gene_symbols',
    cache=True)
D363_Biop_Pro1.var_names_make_unique()
D363_Biop_Pro1.obs['manip'] = 'D363_Biop_Pro1'
D363_Biop_Pro1.obs['position'] = 'Proximal'
D363_Biop_Pro1.obs['method'] = 'Biopsy'
D363_Biop_Pro1.obs['donor'] = 'D363'
# Prefix every barcode with the sample name and use the result as obs_names.
D363_Biop_Pro1.obs['name'] = ['D363_Biop_Pro1_' + s for s in D363_Biop_Pro1.obs_names]
D363_Biop_Pro1.obs_names = D363_Biop_Pro1.obs['name']
D363_Biop_Pro1
# The filtered matrix is read only to get the smallest per-barcode total in it.
filtered = sc.read_10x_mtx(
    './D363_Biop_Pro1/' + outsPathFilter,
    var_names='gene_symbols',
    cache=True)
minCount = filtered.X.sum(axis=1).A1.min()
D363_Biop_Pro1.obs['n_counts'] = D363_Biop_Pro1.X.sum(axis=1).A1
# min_genes=0 keeps every droplet; the call is what fills obs['n_genes'].
sc.pp.filter_cells(D363_Biop_Pro1, min_genes=0)
# Rank droplets by total counts (descending) for the barcode-rank plot.
qc_df = D363_Biop_Pro1.obs[['n_genes', 'n_counts']].sort_values(by='n_counts', ascending=False)
qc_df['cell_nb'] = range(0, qc_df.shape[0])
qc_df = qc_df[qc_df['n_counts'] > 0]
qc_df.shape
# NOTE(review): the strict '>' also drops droplets whose total equals minCount,
# i.e. the smallest barcode(s) of the filtered matrix - confirm '>=' is not wanted.
qc_df_subsetFull = qc_df[qc_df['n_counts'] > minCount]
print(qc_df_subsetFull.shape)
# Droplets with 5 < n_counts <= 50 are the ones used as "empty" droplets.
qc_df_subsetEmpty = qc_df[(qc_df['n_counts'] > 5) & (qc_df['n_counts'] <= 50)]
print(qc_df_subsetEmpty.shape)
fig, ax = plt.subplots()
ax.plot(qc_df[['cell_nb']], qc_df[['n_counts']])
ax.set_xscale('log'); ax.set_yscale('log')
plt.axhline(y=10, color='r') # Visual guide for empty droplets (the selection above uses 5 < n_counts <= 50)
plt.axhline(y=minCount, color='k') # Limit to filtered dataset
plt.show()
# Take explicit copies: new columns are added to both subsets below, and
# writing to an AnnData view would otherwise trigger an implicit copy.
empty = D363_Biop_Pro1[qc_df_subsetEmpty.index.tolist()].copy()
empty.var['n_cells'] = empty.X.sum(axis=0).A1 # despite the name: summed counts per gene over the empty droplets
back = sum(empty.var['n_cells'])/empty.shape[0] # Background score for the sample (mean total counts per empty droplet)
#empty.var.sort_values(by = 'n_cells', ascending = False).iloc[0:50].index.tolist()
D363_Biop_Pro1 = D363_Biop_Pro1[qc_df_subsetFull.index.tolist()].copy()
D363_Biop_Pro1.obs['background'] = back
D363_Biop_Pro1.obs['empty_droplets'] = empty.shape[0]
D363_Biop_Pro1.var['background_gene'] = empty.var['n_cells']
sc.pp.filter_cells(D363_Biop_Pro1, min_genes=0)
mito_genes = D363_Biop_Pro1.var_names.str.startswith('MT-')
# Compute the per-cell total once and reuse it as the denominator of both fractions.
D363_Biop_Pro1.obs['n_counts'] = D363_Biop_Pro1.X.sum(axis=1).A1
D363_Biop_Pro1.obs['percent_mito'] = (
    D363_Biop_Pro1[:, mito_genes].X.sum(axis=1).A1 / D363_Biop_Pro1.obs['n_counts'].values)
# Only the returned gene list is used in this section; the mask is built from
# var_names directly rather than from a second dense to_df() conversion.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D363_Biop_Pro1.to_df())
ribo_genes = D363_Biop_Pro1.var_names.isin(RB_genes_in_df)
D363_Biop_Pro1.obs['percent_ribo'] = (
    D363_Biop_Pro1[:, ribo_genes].X.sum(axis=1).A1 / D363_Biop_Pro1.obs['n_counts'].values)
# NB: var['ribo_genes'] is True for the genes that are NOT in the RB list (genes to keep).
D363_Biop_Pro1.var['ribo_genes'] = ~ribo_genes
sc.pl.violin(D363_Biop_Pro1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
# Sample-specific QC cut-offs.
sc.pp.filter_cells(D363_Biop_Pro1, min_genes=500)
D363_Biop_Pro1 = D363_Biop_Pro1[D363_Biop_Pro1.obs['n_counts'] < 15000, :]
D363_Biop_Pro1 = D363_Biop_Pro1[D363_Biop_Pro1.obs['percent_mito'] < 0.25, :].copy() # copy: the normalisation below modifies the object in place
sc.pp.normalize_per_cell(D363_Biop_Pro1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D363_Biop_Pro1) # log transform the data
D363_Biop_Pro1.raw = D363_Biop_Pro1
# D363_Biop_Pro1_raw keeps a reference to the all-genes object (no copy is made);
# the next line rebinds D363_Biop_Pro1 to the genes flagged in var['ribo_genes'].
D363_Biop_Pro1_raw = D363_Biop_Pro1
D363_Biop_Pro1 = D363_Biop_Pro1[:, D363_Biop_Pro1.var['ribo_genes']]
D363_Biop_Pro1
# Sample D367_Biop_Pro1 (donor D367, biopsy, proximal position).
# Load the raw Cell Ranger matrix (every droplet), score the background from
# low-count droplets, keep the droplets above the smallest total found in the
# Cell Ranger-filtered matrix, then QC-filter, normalise and log-transform.
D367_Biop_Pro1 = sc.read_10x_mtx(
    './D367_Biop_Pro1/' + outsPath,
    var_names='gene_symbols',
    cache=True)
D367_Biop_Pro1.var_names_make_unique()
D367_Biop_Pro1.obs['manip'] = 'D367_Biop_Pro1'
D367_Biop_Pro1.obs['position'] = 'Proximal'
D367_Biop_Pro1.obs['method'] = 'Biopsy'
D367_Biop_Pro1.obs['donor'] = 'D367'
# Prefix every barcode with the sample name and use the result as obs_names.
D367_Biop_Pro1.obs['name'] = ['D367_Biop_Pro1_' + s for s in D367_Biop_Pro1.obs_names]
D367_Biop_Pro1.obs_names = D367_Biop_Pro1.obs['name']
D367_Biop_Pro1
# The filtered matrix is read only to get the smallest per-barcode total in it.
filtered = sc.read_10x_mtx(
    './D367_Biop_Pro1/' + outsPathFilter,
    var_names='gene_symbols',
    cache=True)
minCount = filtered.X.sum(axis=1).A1.min()
D367_Biop_Pro1.obs['n_counts'] = D367_Biop_Pro1.X.sum(axis=1).A1
# min_genes=0 keeps every droplet; the call is what fills obs['n_genes'].
sc.pp.filter_cells(D367_Biop_Pro1, min_genes=0)
# Rank droplets by total counts (descending) for the barcode-rank plot.
qc_df = D367_Biop_Pro1.obs[['n_genes', 'n_counts']].sort_values(by='n_counts', ascending=False)
qc_df['cell_nb'] = range(0, qc_df.shape[0])
qc_df = qc_df[qc_df['n_counts'] > 0]
qc_df.shape
# NOTE(review): the strict '>' also drops droplets whose total equals minCount,
# i.e. the smallest barcode(s) of the filtered matrix - confirm '>=' is not wanted.
qc_df_subsetFull = qc_df[qc_df['n_counts'] > minCount]
print(qc_df_subsetFull.shape)
# Droplets with 5 < n_counts <= 50 are the ones used as "empty" droplets.
qc_df_subsetEmpty = qc_df[(qc_df['n_counts'] > 5) & (qc_df['n_counts'] <= 50)]
print(qc_df_subsetEmpty.shape)
fig, ax = plt.subplots()
ax.plot(qc_df[['cell_nb']], qc_df[['n_counts']])
ax.set_xscale('log'); ax.set_yscale('log')
plt.axhline(y=10, color='r') # Visual guide for empty droplets (the selection above uses 5 < n_counts <= 50)
plt.axhline(y=minCount, color='k') # Limit to filtered dataset
plt.show()
# Take explicit copies: new columns are added to both subsets below, and
# writing to an AnnData view would otherwise trigger an implicit copy.
empty = D367_Biop_Pro1[qc_df_subsetEmpty.index.tolist()].copy()
empty.var['n_cells'] = empty.X.sum(axis=0).A1 # despite the name: summed counts per gene over the empty droplets
back = sum(empty.var['n_cells'])/empty.shape[0] # Background score for the sample (mean total counts per empty droplet)
#empty.var.sort_values(by = 'n_cells', ascending = False).iloc[0:50].index.tolist()
D367_Biop_Pro1 = D367_Biop_Pro1[qc_df_subsetFull.index.tolist()].copy()
D367_Biop_Pro1.obs['background'] = back
D367_Biop_Pro1.obs['empty_droplets'] = empty.shape[0]
D367_Biop_Pro1.var['background_gene'] = empty.var['n_cells']
sc.pp.filter_cells(D367_Biop_Pro1, min_genes=0)
mito_genes = D367_Biop_Pro1.var_names.str.startswith('MT-')
# Compute the per-cell total once and reuse it as the denominator of both fractions.
D367_Biop_Pro1.obs['n_counts'] = D367_Biop_Pro1.X.sum(axis=1).A1
D367_Biop_Pro1.obs['percent_mito'] = (
    D367_Biop_Pro1[:, mito_genes].X.sum(axis=1).A1 / D367_Biop_Pro1.obs['n_counts'].values)
# Only the returned gene list is used in this section; the mask is built from
# var_names directly rather than from a second dense to_df() conversion.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D367_Biop_Pro1.to_df())
ribo_genes = D367_Biop_Pro1.var_names.isin(RB_genes_in_df)
D367_Biop_Pro1.obs['percent_ribo'] = (
    D367_Biop_Pro1[:, ribo_genes].X.sum(axis=1).A1 / D367_Biop_Pro1.obs['n_counts'].values)
# NB: var['ribo_genes'] is True for the genes that are NOT in the RB list (genes to keep).
D367_Biop_Pro1.var['ribo_genes'] = ~ribo_genes
sc.pl.violin(D367_Biop_Pro1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
# Sample-specific QC cut-offs.
sc.pp.filter_cells(D367_Biop_Pro1, min_genes=500)
D367_Biop_Pro1 = D367_Biop_Pro1[D367_Biop_Pro1.obs['n_counts'] < 30000, :]
D367_Biop_Pro1 = D367_Biop_Pro1[D367_Biop_Pro1.obs['percent_mito'] < 0.4, :].copy() # copy: the normalisation below modifies the object in place
sc.pp.normalize_per_cell(D367_Biop_Pro1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D367_Biop_Pro1) # log transform the data
D367_Biop_Pro1.raw = D367_Biop_Pro1
# D367_Biop_Pro1_raw keeps a reference to the all-genes object (no copy is made);
# the next line rebinds D367_Biop_Pro1 to the genes flagged in var['ribo_genes'].
D367_Biop_Pro1_raw = D367_Biop_Pro1
D367_Biop_Pro1 = D367_Biop_Pro1[:, D367_Biop_Pro1.var['ribo_genes']]
D367_Biop_Pro1
# Sample D372_Biop_Pro1 (donor D372, biopsy, proximal position).
# Load the raw Cell Ranger matrix (every droplet), score the background from
# low-count droplets, keep the droplets above the smallest total found in the
# Cell Ranger-filtered matrix, then QC-filter, normalise and log-transform.
D372_Biop_Pro1 = sc.read_10x_mtx(
    './D372_Biop_Pro1/' + outsPath,
    var_names='gene_symbols',
    cache=True)
D372_Biop_Pro1.var_names_make_unique()
D372_Biop_Pro1.obs['manip'] = 'D372_Biop_Pro1'
D372_Biop_Pro1.obs['position'] = 'Proximal'
D372_Biop_Pro1.obs['method'] = 'Biopsy'
D372_Biop_Pro1.obs['donor'] = 'D372'
# Prefix every barcode with the sample name and use the result as obs_names.
D372_Biop_Pro1.obs['name'] = ['D372_Biop_Pro1_' + s for s in D372_Biop_Pro1.obs_names]
D372_Biop_Pro1.obs_names = D372_Biop_Pro1.obs['name']
D372_Biop_Pro1
# The filtered matrix is read only to get the smallest per-barcode total in it.
filtered = sc.read_10x_mtx(
    './D372_Biop_Pro1/' + outsPathFilter,
    var_names='gene_symbols',
    cache=True)
minCount = filtered.X.sum(axis=1).A1.min()
D372_Biop_Pro1.obs['n_counts'] = D372_Biop_Pro1.X.sum(axis=1).A1
# min_genes=0 keeps every droplet; the call is what fills obs['n_genes'].
sc.pp.filter_cells(D372_Biop_Pro1, min_genes=0)
# Rank droplets by total counts (descending) for the barcode-rank plot.
qc_df = D372_Biop_Pro1.obs[['n_genes', 'n_counts']].sort_values(by='n_counts', ascending=False)
qc_df['cell_nb'] = range(0, qc_df.shape[0])
qc_df = qc_df[qc_df['n_counts'] > 0]
qc_df.shape
# NOTE(review): the strict '>' also drops droplets whose total equals minCount,
# i.e. the smallest barcode(s) of the filtered matrix - confirm '>=' is not wanted.
qc_df_subsetFull = qc_df[qc_df['n_counts'] > minCount]
print(qc_df_subsetFull.shape)
# Droplets with 5 < n_counts <= 50 are the ones used as "empty" droplets.
qc_df_subsetEmpty = qc_df[(qc_df['n_counts'] > 5) & (qc_df['n_counts'] <= 50)]
print(qc_df_subsetEmpty.shape)
fig, ax = plt.subplots()
ax.plot(qc_df[['cell_nb']], qc_df[['n_counts']])
ax.set_xscale('log'); ax.set_yscale('log')
plt.axhline(y=10, color='r') # Visual guide for empty droplets (the selection above uses 5 < n_counts <= 50)
plt.axhline(y=minCount, color='k') # Limit to filtered dataset
plt.show()
# Take explicit copies: new columns are added to both subsets below, and
# writing to an AnnData view would otherwise trigger an implicit copy.
empty = D372_Biop_Pro1[qc_df_subsetEmpty.index.tolist()].copy()
empty.var['n_cells'] = empty.X.sum(axis=0).A1 # despite the name: summed counts per gene over the empty droplets
back = sum(empty.var['n_cells'])/empty.shape[0] # Background score for the sample (mean total counts per empty droplet)
#empty.var.sort_values(by = 'n_cells', ascending = False).iloc[0:50].index.tolist()
D372_Biop_Pro1 = D372_Biop_Pro1[qc_df_subsetFull.index.tolist()].copy()
D372_Biop_Pro1.obs['background'] = back
D372_Biop_Pro1.obs['empty_droplets'] = empty.shape[0]
D372_Biop_Pro1.var['background_gene'] = empty.var['n_cells']
sc.pp.filter_cells(D372_Biop_Pro1, min_genes=0)
mito_genes = D372_Biop_Pro1.var_names.str.startswith('MT-')
# Compute the per-cell total once and reuse it as the denominator of both fractions.
D372_Biop_Pro1.obs['n_counts'] = D372_Biop_Pro1.X.sum(axis=1).A1
D372_Biop_Pro1.obs['percent_mito'] = (
    D372_Biop_Pro1[:, mito_genes].X.sum(axis=1).A1 / D372_Biop_Pro1.obs['n_counts'].values)
# Only the returned gene list is used in this section; the mask is built from
# var_names directly rather than from a second dense to_df() conversion.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D372_Biop_Pro1.to_df())
ribo_genes = D372_Biop_Pro1.var_names.isin(RB_genes_in_df)
D372_Biop_Pro1.obs['percent_ribo'] = (
    D372_Biop_Pro1[:, ribo_genes].X.sum(axis=1).A1 / D372_Biop_Pro1.obs['n_counts'].values)
# NB: var['ribo_genes'] is True for the genes that are NOT in the RB list (genes to keep).
D372_Biop_Pro1.var['ribo_genes'] = ~ribo_genes
sc.pl.violin(D372_Biop_Pro1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
# Sample-specific QC cut-offs.
sc.pp.filter_cells(D372_Biop_Pro1, min_genes=500)
D372_Biop_Pro1 = D372_Biop_Pro1[D372_Biop_Pro1.obs['n_counts'] < 30000, :]
D372_Biop_Pro1 = D372_Biop_Pro1[D372_Biop_Pro1.obs['percent_mito'] < 0.3, :].copy() # copy: the normalisation below modifies the object in place
sc.pp.normalize_per_cell(D372_Biop_Pro1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D372_Biop_Pro1) # log transform the data
D372_Biop_Pro1.raw = D372_Biop_Pro1
# D372_Biop_Pro1_raw keeps a reference to the all-genes object (no copy is made);
# the next line rebinds D372_Biop_Pro1 to the genes flagged in var['ribo_genes'].
D372_Biop_Pro1_raw = D372_Biop_Pro1
D372_Biop_Pro1 = D372_Biop_Pro1[:, D372_Biop_Pro1.var['ribo_genes']]
D372_Biop_Pro1
# Sample D322_Biop_Int1 (donor D322, biopsy, intermediate position).
# Load the raw Cell Ranger matrix (every droplet), score the background from
# low-count droplets, keep the droplets above the smallest total found in the
# Cell Ranger-filtered matrix, then QC-filter, normalise and log-transform.
D322_Biop_Int1 = sc.read_10x_mtx(
    './D322_Biop_Int1/' + outsPath,
    var_names='gene_symbols',
    cache=True)
D322_Biop_Int1.var_names_make_unique()
D322_Biop_Int1.obs['manip'] = 'D322_Biop_Int1'
D322_Biop_Int1.obs['position'] = 'Intermediate'
D322_Biop_Int1.obs['method'] = 'Biopsy'
D322_Biop_Int1.obs['donor'] = 'D322'
# Prefix every barcode with the sample name and use the result as obs_names.
D322_Biop_Int1.obs['name'] = ['D322_Biop_Int1_' + s for s in D322_Biop_Int1.obs_names]
D322_Biop_Int1.obs_names = D322_Biop_Int1.obs['name']
D322_Biop_Int1
# The filtered matrix is read only to get the smallest per-barcode total in it.
filtered = sc.read_10x_mtx(
    './D322_Biop_Int1/' + outsPathFilter,
    var_names='gene_symbols',
    cache=True)
minCount = filtered.X.sum(axis=1).A1.min()
D322_Biop_Int1.obs['n_counts'] = D322_Biop_Int1.X.sum(axis=1).A1
# min_genes=0 keeps every droplet; the call is what fills obs['n_genes'].
sc.pp.filter_cells(D322_Biop_Int1, min_genes=0)
# Rank droplets by total counts (descending) for the barcode-rank plot.
qc_df = D322_Biop_Int1.obs[['n_genes', 'n_counts']].sort_values(by='n_counts', ascending=False)
qc_df['cell_nb'] = range(0, qc_df.shape[0])
qc_df = qc_df[qc_df['n_counts'] > 0]
qc_df.shape
# NOTE(review): the strict '>' also drops droplets whose total equals minCount,
# i.e. the smallest barcode(s) of the filtered matrix - confirm '>=' is not wanted.
qc_df_subsetFull = qc_df[qc_df['n_counts'] > minCount]
print(qc_df_subsetFull.shape)
# Droplets with 5 < n_counts <= 50 are the ones used as "empty" droplets.
qc_df_subsetEmpty = qc_df[(qc_df['n_counts'] > 5) & (qc_df['n_counts'] <= 50)]
print(qc_df_subsetEmpty.shape)
fig, ax = plt.subplots()
ax.plot(qc_df[['cell_nb']], qc_df[['n_counts']])
ax.set_xscale('log'); ax.set_yscale('log')
plt.axhline(y=10, color='r') # Visual guide for empty droplets (the selection above uses 5 < n_counts <= 50)
plt.axhline(y=minCount, color='k') # Limit to filtered dataset
plt.show()
# Take explicit copies: new columns are added to both subsets below, and
# writing to an AnnData view would otherwise trigger an implicit copy.
empty = D322_Biop_Int1[qc_df_subsetEmpty.index.tolist()].copy()
empty.var['n_cells'] = empty.X.sum(axis=0).A1 # despite the name: summed counts per gene over the empty droplets
back = sum(empty.var['n_cells'])/empty.shape[0] # Background score for the sample (mean total counts per empty droplet)
#empty.var.sort_values(by = 'n_cells', ascending = False).iloc[0:50].index.tolist()
D322_Biop_Int1 = D322_Biop_Int1[qc_df_subsetFull.index.tolist()].copy()
D322_Biop_Int1.obs['background'] = back
D322_Biop_Int1.obs['empty_droplets'] = empty.shape[0]
D322_Biop_Int1.var['background_gene'] = empty.var['n_cells']
sc.pp.filter_cells(D322_Biop_Int1, min_genes=0)
mito_genes = D322_Biop_Int1.var_names.str.startswith('MT-')
# Compute the per-cell total once and reuse it as the denominator of both fractions.
D322_Biop_Int1.obs['n_counts'] = D322_Biop_Int1.X.sum(axis=1).A1
D322_Biop_Int1.obs['percent_mito'] = (
    D322_Biop_Int1[:, mito_genes].X.sum(axis=1).A1 / D322_Biop_Int1.obs['n_counts'].values)
# Only the returned gene list is used in this section; the mask is built from
# var_names directly rather than from a second dense to_df() conversion.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D322_Biop_Int1.to_df())
ribo_genes = D322_Biop_Int1.var_names.isin(RB_genes_in_df)
D322_Biop_Int1.obs['percent_ribo'] = (
    D322_Biop_Int1[:, ribo_genes].X.sum(axis=1).A1 / D322_Biop_Int1.obs['n_counts'].values)
# NB: var['ribo_genes'] is True for the genes that are NOT in the RB list (genes to keep).
D322_Biop_Int1.var['ribo_genes'] = ~ribo_genes
sc.pl.violin(D322_Biop_Int1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
# Sample-specific QC cut-offs.
sc.pp.filter_cells(D322_Biop_Int1, min_genes=500)
D322_Biop_Int1 = D322_Biop_Int1[D322_Biop_Int1.obs['n_counts'] < 20000, :]
D322_Biop_Int1 = D322_Biop_Int1[D322_Biop_Int1.obs['percent_mito'] < 0.2, :].copy() # copy: the normalisation below modifies the object in place
sc.pp.normalize_per_cell(D322_Biop_Int1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D322_Biop_Int1) # log transform the data
D322_Biop_Int1.raw = D322_Biop_Int1
# D322_Biop_Int1_raw keeps a reference to the all-genes object (no copy is made);
# the next line rebinds D322_Biop_Int1 to the genes flagged in var['ribo_genes'].
D322_Biop_Int1_raw = D322_Biop_Int1
D322_Biop_Int1 = D322_Biop_Int1[:, D322_Biop_Int1.var['ribo_genes']]
D322_Biop_Int1
# Sample D326_Biop_Int1 (donor D326, biopsy, intermediate position).
# Load the raw Cell Ranger matrix (every droplet), score the background from
# low-count droplets, keep the droplets above the smallest total found in the
# Cell Ranger-filtered matrix, then QC-filter, normalise and log-transform.
D326_Biop_Int1 = sc.read_10x_mtx(
    './D326_Biop_Int1/' + outsPath,
    var_names='gene_symbols',
    cache=True)
D326_Biop_Int1.var_names_make_unique()
D326_Biop_Int1.obs['manip'] = 'D326_Biop_Int1'
D326_Biop_Int1.obs['position'] = 'Intermediate'
D326_Biop_Int1.obs['method'] = 'Biopsy'
D326_Biop_Int1.obs['donor'] = 'D326'
# Prefix every barcode with the sample name and use the result as obs_names.
D326_Biop_Int1.obs['name'] = ['D326_Biop_Int1_' + s for s in D326_Biop_Int1.obs_names]
D326_Biop_Int1.obs_names = D326_Biop_Int1.obs['name']
D326_Biop_Int1
# The filtered matrix is read only to get the smallest per-barcode total in it.
filtered = sc.read_10x_mtx(
    './D326_Biop_Int1/' + outsPathFilter,
    var_names='gene_symbols',
    cache=True)
minCount = filtered.X.sum(axis=1).A1.min()
D326_Biop_Int1.obs['n_counts'] = D326_Biop_Int1.X.sum(axis=1).A1
# min_genes=0 keeps every droplet; the call is what fills obs['n_genes'].
sc.pp.filter_cells(D326_Biop_Int1, min_genes=0)
# Rank droplets by total counts (descending) for the barcode-rank plot.
qc_df = D326_Biop_Int1.obs[['n_genes', 'n_counts']].sort_values(by='n_counts', ascending=False)
qc_df['cell_nb'] = range(0, qc_df.shape[0])
qc_df = qc_df[qc_df['n_counts'] > 0]
qc_df.shape
# NOTE(review): the strict '>' also drops droplets whose total equals minCount,
# i.e. the smallest barcode(s) of the filtered matrix - confirm '>=' is not wanted.
qc_df_subsetFull = qc_df[qc_df['n_counts'] > minCount]
print(qc_df_subsetFull.shape)
# Droplets with 5 < n_counts <= 50 are the ones used as "empty" droplets.
qc_df_subsetEmpty = qc_df[(qc_df['n_counts'] > 5) & (qc_df['n_counts'] <= 50)]
print(qc_df_subsetEmpty.shape)
fig, ax = plt.subplots()
ax.plot(qc_df[['cell_nb']], qc_df[['n_counts']])
ax.set_xscale('log'); ax.set_yscale('log')
plt.axhline(y=10, color='r') # Visual guide for empty droplets (the selection above uses 5 < n_counts <= 50)
plt.axhline(y=minCount, color='k') # Limit to filtered dataset
plt.show()
# Take explicit copies: new columns are added to both subsets below, and
# writing to an AnnData view would otherwise trigger an implicit copy.
empty = D326_Biop_Int1[qc_df_subsetEmpty.index.tolist()].copy()
empty.var['n_cells'] = empty.X.sum(axis=0).A1 # despite the name: summed counts per gene over the empty droplets
back = sum(empty.var['n_cells'])/empty.shape[0] # Background score for the sample (mean total counts per empty droplet)
#empty.var.sort_values(by = 'n_cells', ascending = False).iloc[0:50].index.tolist()
D326_Biop_Int1 = D326_Biop_Int1[qc_df_subsetFull.index.tolist()].copy()
D326_Biop_Int1.obs['background'] = back
D326_Biop_Int1.obs['empty_droplets'] = empty.shape[0]
D326_Biop_Int1.var['background_gene'] = empty.var['n_cells']
sc.pp.filter_cells(D326_Biop_Int1, min_genes=0)
mito_genes = D326_Biop_Int1.var_names.str.startswith('MT-')
# Compute the per-cell total once and reuse it as the denominator of both fractions.
D326_Biop_Int1.obs['n_counts'] = D326_Biop_Int1.X.sum(axis=1).A1
D326_Biop_Int1.obs['percent_mito'] = (
    D326_Biop_Int1[:, mito_genes].X.sum(axis=1).A1 / D326_Biop_Int1.obs['n_counts'].values)
# Only the returned gene list is used in this section; the mask is built from
# var_names directly rather than from a second dense to_df() conversion.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D326_Biop_Int1.to_df())
ribo_genes = D326_Biop_Int1.var_names.isin(RB_genes_in_df)
D326_Biop_Int1.obs['percent_ribo'] = (
    D326_Biop_Int1[:, ribo_genes].X.sum(axis=1).A1 / D326_Biop_Int1.obs['n_counts'].values)
# NB: var['ribo_genes'] is True for the genes that are NOT in the RB list (genes to keep).
D326_Biop_Int1.var['ribo_genes'] = ~ribo_genes
sc.pl.violin(D326_Biop_Int1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
# Sample-specific QC cut-offs.
sc.pp.filter_cells(D326_Biop_Int1, min_genes=500)
D326_Biop_Int1 = D326_Biop_Int1[D326_Biop_Int1.obs['n_counts'] < 25000, :]
D326_Biop_Int1 = D326_Biop_Int1[D326_Biop_Int1.obs['percent_mito'] < 0.3, :].copy() # copy: the normalisation below modifies the object in place
sc.pp.normalize_per_cell(D326_Biop_Int1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D326_Biop_Int1) # log transform the data
D326_Biop_Int1.raw = D326_Biop_Int1
# D326_Biop_Int1_raw keeps a reference to the all-genes object (no copy is made);
# the next line rebinds D326_Biop_Int1 to the genes flagged in var['ribo_genes'].
D326_Biop_Int1_raw = D326_Biop_Int1
D326_Biop_Int1 = D326_Biop_Int1[:, D326_Biop_Int1.var['ribo_genes']]
D326_Biop_Int1
# ---- Sample D339_Biop_Int1 (donor 'D339', method 'Biopsy', position 'Intermediate') ----
# Load the matrix found under outsPath (raw_gene_bc_matrices), i.e. including the
# barcodes that the filtered matrix does not contain.
D339_Biop_Int1 = sc.read_10x_mtx(
'./D339_Biop_Int1/' + outsPath,
var_names='gene_symbols',
cache=True)
D339_Biop_Int1.var_names_make_unique()
# Per-barcode sample metadata.
D339_Biop_Int1.obs['manip'] = 'D339_Biop_Int1'
D339_Biop_Int1.obs['position'] = 'Intermediate'
D339_Biop_Int1.obs['method'] = 'Biopsy'
D339_Biop_Int1.obs['donor'] = 'D339'
# Prefix every barcode with the sample name and use the result as obs_names.
D339_Biop_Int1.obs['name'] = ['D339_Biop_Int1_' + s for s in list(D339_Biop_Int1.obs.index)]
D339_Biop_Int1.obs_names = D339_Biop_Int1.obs['name']
D339_Biop_Int1
# Load the filtered matrix of the same sample; its smallest per-barcode total count
# becomes the threshold separating retained barcodes from background.
filtered = sc.read_10x_mtx(
'./D339_Biop_Int1/' + outsPathFilter,
var_names='gene_symbols',
cache=True)
minCount = min(filtered.X.sum(axis=1).A1)
# Total counts per barcode. filter_cells is called with min_genes=0; the 'n_genes'
# column read just below is presumably added by this call -- TODO confirm.
D339_Biop_Int1.obs['n_counts'] = D339_Biop_Int1.X.sum(axis=1).A1
sc.pp.filter_cells(D339_Biop_Int1, min_genes=0)
# NOTE(review): attribute access without a call; it has no effect on the data.
D339_Biop_Int1.obs_keys
# Barcode-rank table: barcodes sorted by decreasing total count and ranked from 0;
# zero-count barcodes are dropped after the ranks have been assigned.
qc_df = D339_Biop_Int1.obs[['n_genes','n_counts']]
qc_df = qc_df.sort_values(by = 'n_counts', ascending = False)
qc_df['cell_nb'] = range(0, qc_df.shape[0])
qc_df = qc_df[qc_df['n_counts'] > 0]
qc_df.shape
# Barcodes with strictly more counts than minCount are the ones kept further down.
qc_df_subsetFull = qc_df[qc_df['n_counts'] > minCount]
print(qc_df_subsetFull.shape)
# Barcodes with 5 < n_counts <= 50 are used as the empty-droplet (background) set.
qc_df_subsetEmpty = qc_df[qc_df['n_counts'] <= 50]
qc_df_subsetEmpty = qc_df_subsetEmpty[qc_df_subsetEmpty['n_counts'] > 5]
print(qc_df_subsetEmpty.shape)
# Barcode-rank plot (rank vs. total counts) on log-log axes.
fig, ax = plt.subplots()
ax.plot(qc_df[['cell_nb']], qc_df[['n_counts']])
ax.set_xscale('log'); ax.set_yscale('log')
# NOTE(review): the red line is drawn at y=10, whereas the empty-droplet window
# applied above is 5 < n_counts <= 50 -- confirm which value is intended.
plt.axhline(y=10, color='r') # Limit to empty droplets
plt.axhline(y=minCount, color='k') # Limit to filtered dataset
plt.show()
# Per-gene counts summed over the empty droplets. The column is called 'n_cells'
# but it holds summed counts, not a number of cells.
empty = D339_Biop_Int1[qc_df_subsetEmpty.index.tolist()]
empty.var['n_cells'] = empty.X.sum(axis=0).A1
# Sum over all genes divided by the number of empty droplets, i.e. the mean total
# count per empty droplet.
back = sum(empty.var['n_cells'])/empty.shape[0] # Background score for the sample
#empty.var.sort_values(by = 'n_cells', ascending = False).iloc[0:50].index.tolist()
# Keep only the barcodes above the minCount threshold and attach the background
# statistics: one value per sample in obs, one value per gene in var.
D339_Biop_Int1 = D339_Biop_Int1[qc_df_subsetFull.index.tolist()]
D339_Biop_Int1.obs['background'] = back
D339_Biop_Int1.obs['empty_droplets'] = empty.shape[0]
D339_Biop_Int1.var['background_gene'] = empty.var['n_cells']
sc.pp.filter_cells(D339_Biop_Int1, min_genes=0)
# Fraction (0-1, not a percentage) of counts from genes whose symbol starts with 'MT-'.
mito_genes = D339_Biop_Int1.var_names.str.startswith('MT-')
D339_Biop_Int1.obs['percent_mito'] = np.sum(
D339_Biop_Int1[:, mito_genes].X, axis=1).A1 / np.sum(D339_Biop_Int1.X, axis=1).A1
D339_Biop_Int1.obs['n_counts'] = D339_Biop_Int1.X.sum(axis=1).A1
# Fraction of counts from the genes listed in the RB gene file. Within this section
# only the third return value of remove_RB_genes (the gene list) is used afterwards.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D339_Biop_Int1.to_df())
ribo_genes = D339_Biop_Int1.to_df().columns.isin(RB_genes_in_df)
D339_Biop_Int1.obs['percent_ribo'] = np.sum(
D339_Biop_Int1[:, ribo_genes].X, axis=1).A1 / np.sum(D339_Biop_Int1.X, axis=1).A1
# Despite its name, var['ribo_genes'] is True for genes that are NOT in the RB list.
D339_Biop_Int1.var['ribo_genes'] = [not i for i in ribo_genes]
sc.pl.violin(D339_Biop_Int1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
jitter=0.4, multi_panel=True)
# QC cut-offs used for this sample: min_genes=500, n_counts < 30000,
# percent_mito < 0.15.
sc.pp.filter_cells(D339_Biop_Int1, min_genes=500)
D339_Biop_Int1 = D339_Biop_Int1[D339_Biop_Int1.obs['n_counts'] < 30000, :]
D339_Biop_Int1 = D339_Biop_Int1[D339_Biop_Int1.obs['percent_mito'] < 0.15, :]
sc.pp.normalize_per_cell(D339_Biop_Int1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D339_Biop_Int1) # log transform the data
# Store the normalised, log-transformed object (all genes) in .raw and bind a second
# name to the same object (no copy is made) before the subset below rebinds the variable.
D339_Biop_Int1.raw = D339_Biop_Int1
D339_Biop_Int1_raw = D339_Biop_Int1 # freeze the object (for later use of the raw state of it)
# Keep only the genes flagged True in var['ribo_genes'], i.e. the non-RB genes.
D339_Biop_Int1 = D339_Biop_Int1[:, D339_Biop_Int1.var['ribo_genes']]
D339_Biop_Int1
# ---- Sample D344_Biop_Int1 (donor 'D344', method 'Biopsy', position 'Intermediate') ----
# Load the matrix found under outsPath (raw_gene_bc_matrices), i.e. including the
# barcodes that the filtered matrix does not contain.
D344_Biop_Int1 = sc.read_10x_mtx(
'./D344_Biop_Int1/' + outsPath,
var_names='gene_symbols',
cache=True)
D344_Biop_Int1.var_names_make_unique()
# Per-barcode sample metadata.
D344_Biop_Int1.obs['manip'] = 'D344_Biop_Int1'
D344_Biop_Int1.obs['position'] = 'Intermediate'
D344_Biop_Int1.obs['method'] = 'Biopsy'
D344_Biop_Int1.obs['donor'] = 'D344'
# Prefix every barcode with the sample name and use the result as obs_names.
D344_Biop_Int1.obs['name'] = ['D344_Biop_Int1_' + s for s in list(D344_Biop_Int1.obs.index)]
D344_Biop_Int1.obs_names = D344_Biop_Int1.obs['name']
D344_Biop_Int1
# Load the filtered matrix of the same sample; its smallest per-barcode total count
# becomes the threshold separating retained barcodes from background.
filtered = sc.read_10x_mtx(
'./D344_Biop_Int1/' + outsPathFilter,
var_names='gene_symbols',
cache=True)
minCount = min(filtered.X.sum(axis=1).A1)
# Total counts per barcode. filter_cells is called with min_genes=0; the 'n_genes'
# column read just below is presumably added by this call -- TODO confirm.
D344_Biop_Int1.obs['n_counts'] = D344_Biop_Int1.X.sum(axis=1).A1
sc.pp.filter_cells(D344_Biop_Int1, min_genes=0)
# NOTE(review): attribute access without a call; it has no effect on the data.
D344_Biop_Int1.obs_keys
# Barcode-rank table: barcodes sorted by decreasing total count and ranked from 0;
# zero-count barcodes are dropped after the ranks have been assigned.
qc_df = D344_Biop_Int1.obs[['n_genes','n_counts']]
qc_df = qc_df.sort_values(by = 'n_counts', ascending = False)
qc_df['cell_nb'] = range(0, qc_df.shape[0])
qc_df = qc_df[qc_df['n_counts'] > 0]
qc_df.shape
# Barcodes with strictly more counts than minCount are the ones kept further down.
qc_df_subsetFull = qc_df[qc_df['n_counts'] > minCount]
print(qc_df_subsetFull.shape)
# Barcodes with 5 < n_counts <= 50 are used as the empty-droplet (background) set.
qc_df_subsetEmpty = qc_df[qc_df['n_counts'] <= 50]
qc_df_subsetEmpty = qc_df_subsetEmpty[qc_df_subsetEmpty['n_counts'] > 5]
print(qc_df_subsetEmpty.shape)
# Barcode-rank plot (rank vs. total counts) on log-log axes.
fig, ax = plt.subplots()
ax.plot(qc_df[['cell_nb']], qc_df[['n_counts']])
ax.set_xscale('log'); ax.set_yscale('log')
# NOTE(review): the red line is drawn at y=10, whereas the empty-droplet window
# applied above is 5 < n_counts <= 50 -- confirm which value is intended.
plt.axhline(y=10, color='r') # Limit to empty droplets
plt.axhline(y=minCount, color='k') # Limit to filtered dataset
plt.show()
# Per-gene counts summed over the empty droplets. The column is called 'n_cells'
# but it holds summed counts, not a number of cells.
empty = D344_Biop_Int1[qc_df_subsetEmpty.index.tolist()]
empty.var['n_cells'] = empty.X.sum(axis=0).A1
# Sum over all genes divided by the number of empty droplets, i.e. the mean total
# count per empty droplet.
back = sum(empty.var['n_cells'])/empty.shape[0] # Background score for the sample
#empty.var.sort_values(by = 'n_cells', ascending = False).iloc[0:50].index.tolist()
# Keep only the barcodes above the minCount threshold and attach the background
# statistics: one value per sample in obs, one value per gene in var.
D344_Biop_Int1 = D344_Biop_Int1[qc_df_subsetFull.index.tolist()]
D344_Biop_Int1.obs['background'] = back
D344_Biop_Int1.obs['empty_droplets'] = empty.shape[0]
D344_Biop_Int1.var['background_gene'] = empty.var['n_cells']
sc.pp.filter_cells(D344_Biop_Int1, min_genes=0)
# Fraction (0-1, not a percentage) of counts from genes whose symbol starts with 'MT-'.
mito_genes = D344_Biop_Int1.var_names.str.startswith('MT-')
D344_Biop_Int1.obs['percent_mito'] = np.sum(
D344_Biop_Int1[:, mito_genes].X, axis=1).A1 / np.sum(D344_Biop_Int1.X, axis=1).A1
D344_Biop_Int1.obs['n_counts'] = D344_Biop_Int1.X.sum(axis=1).A1
# Fraction of counts from the genes listed in the RB gene file. Within this section
# only the third return value of remove_RB_genes (the gene list) is used afterwards.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D344_Biop_Int1.to_df())
ribo_genes = D344_Biop_Int1.to_df().columns.isin(RB_genes_in_df)
D344_Biop_Int1.obs['percent_ribo'] = np.sum(
D344_Biop_Int1[:, ribo_genes].X, axis=1).A1 / np.sum(D344_Biop_Int1.X, axis=1).A1
# Despite its name, var['ribo_genes'] is True for genes that are NOT in the RB list.
D344_Biop_Int1.var['ribo_genes'] = [not i for i in ribo_genes]
sc.pl.violin(D344_Biop_Int1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
jitter=0.4, multi_panel=True)
# QC cut-offs used for this sample: min_genes=500, n_counts < 10000,
# percent_mito < 0.1.
sc.pp.filter_cells(D344_Biop_Int1, min_genes=500)
D344_Biop_Int1 = D344_Biop_Int1[D344_Biop_Int1.obs['n_counts'] < 10000, :]
D344_Biop_Int1 = D344_Biop_Int1[D344_Biop_Int1.obs['percent_mito'] < 0.1, :]
sc.pp.normalize_per_cell(D344_Biop_Int1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D344_Biop_Int1) # log transform the data
# Store the normalised, log-transformed object (all genes) in .raw and bind a second
# name to the same object (no copy is made) before the subset below rebinds the variable.
D344_Biop_Int1.raw = D344_Biop_Int1
D344_Biop_Int1_raw = D344_Biop_Int1 # freeze the object (for later use of the raw state of it)
# Keep only the genes flagged True in var['ribo_genes'], i.e. the non-RB genes.
D344_Biop_Int1 = D344_Biop_Int1[:, D344_Biop_Int1.var['ribo_genes']]
D344_Biop_Int1
# ---- Sample D353_Biop_Int2 (donor 'D353', method 'Biopsy', position 'Intermediate') ----
# Load the matrix found under outsPath (raw_gene_bc_matrices), i.e. including the
# barcodes that the filtered matrix does not contain.
D353_Biop_Int2 = sc.read_10x_mtx(
'./D353_Biop_Int2/' + outsPath,
var_names='gene_symbols',
cache=True)
D353_Biop_Int2.var_names_make_unique()
# Per-barcode sample metadata.
D353_Biop_Int2.obs['manip'] = 'D353_Biop_Int2'
D353_Biop_Int2.obs['position'] = 'Intermediate'
D353_Biop_Int2.obs['method'] = 'Biopsy'
D353_Biop_Int2.obs['donor'] = 'D353'
# Prefix every barcode with the sample name and use the result as obs_names.
D353_Biop_Int2.obs['name'] = ['D353_Biop_Int2_' + s for s in list(D353_Biop_Int2.obs.index)]
D353_Biop_Int2.obs_names = D353_Biop_Int2.obs['name']
D353_Biop_Int2
# Load the filtered matrix of the same sample; its smallest per-barcode total count
# becomes the threshold separating retained barcodes from background.
filtered = sc.read_10x_mtx(
'./D353_Biop_Int2/' + outsPathFilter,
var_names='gene_symbols',
cache=True)
minCount = min(filtered.X.sum(axis=1).A1)
# Total counts per barcode. filter_cells is called with min_genes=0; the 'n_genes'
# column read just below is presumably added by this call -- TODO confirm.
D353_Biop_Int2.obs['n_counts'] = D353_Biop_Int2.X.sum(axis=1).A1
sc.pp.filter_cells(D353_Biop_Int2, min_genes=0)
# NOTE(review): attribute access without a call; it has no effect on the data.
D353_Biop_Int2.obs_keys
# Barcode-rank table: barcodes sorted by decreasing total count and ranked from 0;
# zero-count barcodes are dropped after the ranks have been assigned.
qc_df = D353_Biop_Int2.obs[['n_genes','n_counts']]
qc_df = qc_df.sort_values(by = 'n_counts', ascending = False)
qc_df['cell_nb'] = range(0, qc_df.shape[0])
qc_df = qc_df[qc_df['n_counts'] > 0]
qc_df.shape
# Barcodes with strictly more counts than minCount are the ones kept further down.
qc_df_subsetFull = qc_df[qc_df['n_counts'] > minCount]
print(qc_df_subsetFull.shape)
# Barcodes with 5 < n_counts <= 50 are used as the empty-droplet (background) set.
qc_df_subsetEmpty = qc_df[qc_df['n_counts'] <= 50]
qc_df_subsetEmpty = qc_df_subsetEmpty[qc_df_subsetEmpty['n_counts'] > 5]
print(qc_df_subsetEmpty.shape)
# Barcode-rank plot (rank vs. total counts) on log-log axes.
fig, ax = plt.subplots()
ax.plot(qc_df[['cell_nb']], qc_df[['n_counts']])
ax.set_xscale('log'); ax.set_yscale('log')
# NOTE(review): the red line is drawn at y=10, whereas the empty-droplet window
# applied above is 5 < n_counts <= 50 -- confirm which value is intended.
plt.axhline(y=10, color='r') # Limit to empty droplets
plt.axhline(y=minCount, color='k') # Limit to filtered dataset
plt.show()
# Per-gene counts summed over the empty droplets. The column is called 'n_cells'
# but it holds summed counts, not a number of cells.
empty = D353_Biop_Int2[qc_df_subsetEmpty.index.tolist()]
empty.var['n_cells'] = empty.X.sum(axis=0).A1
# Sum over all genes divided by the number of empty droplets, i.e. the mean total
# count per empty droplet.
back = sum(empty.var['n_cells'])/empty.shape[0] # Background score for the sample
#empty.var.sort_values(by = 'n_cells', ascending = False).iloc[0:50].index.tolist()
# Keep only the barcodes above the minCount threshold and attach the background
# statistics: one value per sample in obs, one value per gene in var.
D353_Biop_Int2 = D353_Biop_Int2[qc_df_subsetFull.index.tolist()]
D353_Biop_Int2.obs['background'] = back
D353_Biop_Int2.obs['empty_droplets'] = empty.shape[0]
D353_Biop_Int2.var['background_gene'] = empty.var['n_cells']
sc.pp.filter_cells(D353_Biop_Int2, min_genes=0)
# Fraction (0-1, not a percentage) of counts from genes whose symbol starts with 'MT-'.
mito_genes = D353_Biop_Int2.var_names.str.startswith('MT-')
D353_Biop_Int2.obs['percent_mito'] = np.sum(
D353_Biop_Int2[:, mito_genes].X, axis=1).A1 / np.sum(D353_Biop_Int2.X, axis=1).A1
D353_Biop_Int2.obs['n_counts'] = D353_Biop_Int2.X.sum(axis=1).A1
# Fraction of counts from the genes listed in the RB gene file. Within this section
# only the third return value of remove_RB_genes (the gene list) is used afterwards.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D353_Biop_Int2.to_df())
ribo_genes = D353_Biop_Int2.to_df().columns.isin(RB_genes_in_df)
D353_Biop_Int2.obs['percent_ribo'] = np.sum(
D353_Biop_Int2[:, ribo_genes].X, axis=1).A1 / np.sum(D353_Biop_Int2.X, axis=1).A1
# Despite its name, var['ribo_genes'] is True for genes that are NOT in the RB list.
D353_Biop_Int2.var['ribo_genes'] = [not i for i in ribo_genes]
sc.pl.violin(D353_Biop_Int2, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
jitter=0.4, multi_panel=True)
# QC cut-offs used for this sample: min_genes=500, n_counts < 10000,
# percent_mito < 0.15.
sc.pp.filter_cells(D353_Biop_Int2, min_genes=500)
D353_Biop_Int2 = D353_Biop_Int2[D353_Biop_Int2.obs['n_counts'] < 10000, :]
D353_Biop_Int2 = D353_Biop_Int2[D353_Biop_Int2.obs['percent_mito'] < 0.15, :]
sc.pp.normalize_per_cell(D353_Biop_Int2, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D353_Biop_Int2) # log transform the data
# Store the normalised, log-transformed object (all genes) in .raw and bind a second
# name to the same object (no copy is made) before the subset below rebinds the variable.
D353_Biop_Int2.raw = D353_Biop_Int2
D353_Biop_Int2_raw = D353_Biop_Int2 # freeze the object (for later use of the raw state of it)
# Keep only the genes flagged True in var['ribo_genes'], i.e. the non-RB genes.
D353_Biop_Int2 = D353_Biop_Int2[:, D353_Biop_Int2.var['ribo_genes']]
D353_Biop_Int2
# ---- Sample D354_Biop_Int2 (donor 'D354', method 'Biopsy', position 'Intermediate') ----
# Load the matrix found under outsPath (raw_gene_bc_matrices), i.e. including the
# barcodes that the filtered matrix does not contain.
D354_Biop_Int2 = sc.read_10x_mtx(
'./D354_Biop_Int2/' + outsPath,
var_names='gene_symbols',
cache=True)
D354_Biop_Int2.var_names_make_unique()
# Per-barcode sample metadata.
D354_Biop_Int2.obs['manip'] = 'D354_Biop_Int2'
D354_Biop_Int2.obs['position'] = 'Intermediate'
D354_Biop_Int2.obs['method'] = 'Biopsy'
D354_Biop_Int2.obs['donor'] = 'D354'
# Prefix every barcode with the sample name and use the result as obs_names.
D354_Biop_Int2.obs['name'] = ['D354_Biop_Int2_' + s for s in list(D354_Biop_Int2.obs.index)]
D354_Biop_Int2.obs_names = D354_Biop_Int2.obs['name']
D354_Biop_Int2
# Load the filtered matrix of the same sample; its smallest per-barcode total count
# becomes the threshold separating retained barcodes from background.
filtered = sc.read_10x_mtx(
'./D354_Biop_Int2/' + outsPathFilter,
var_names='gene_symbols',
cache=True)
minCount = min(filtered.X.sum(axis=1).A1)
# Total counts per barcode. filter_cells is called with min_genes=0; the 'n_genes'
# column read just below is presumably added by this call -- TODO confirm.
D354_Biop_Int2.obs['n_counts'] = D354_Biop_Int2.X.sum(axis=1).A1
sc.pp.filter_cells(D354_Biop_Int2, min_genes=0)
# NOTE(review): attribute access without a call; it has no effect on the data.
D354_Biop_Int2.obs_keys
# Barcode-rank table: barcodes sorted by decreasing total count and ranked from 0;
# zero-count barcodes are dropped after the ranks have been assigned.
qc_df = D354_Biop_Int2.obs[['n_genes','n_counts']]
qc_df = qc_df.sort_values(by = 'n_counts', ascending = False)
qc_df['cell_nb'] = range(0, qc_df.shape[0])
qc_df = qc_df[qc_df['n_counts'] > 0]
qc_df.shape
# Barcodes with strictly more counts than minCount are the ones kept further down.
qc_df_subsetFull = qc_df[qc_df['n_counts'] > minCount]
print(qc_df_subsetFull.shape)
# Barcodes with 5 < n_counts <= 50 are used as the empty-droplet (background) set.
qc_df_subsetEmpty = qc_df[qc_df['n_counts'] <= 50]
qc_df_subsetEmpty = qc_df_subsetEmpty[qc_df_subsetEmpty['n_counts'] > 5]
print(qc_df_subsetEmpty.shape)
# Barcode-rank plot (rank vs. total counts) on log-log axes.
fig, ax = plt.subplots()
ax.plot(qc_df[['cell_nb']], qc_df[['n_counts']])
ax.set_xscale('log'); ax.set_yscale('log')
# NOTE(review): the red line is drawn at y=10, whereas the empty-droplet window
# applied above is 5 < n_counts <= 50 -- confirm which value is intended.
plt.axhline(y=10, color='r') # Limit to empty droplets
plt.axhline(y=minCount, color='k') # Limit to filtered dataset
plt.show()
# Per-gene counts summed over the empty droplets. The column is called 'n_cells'
# but it holds summed counts, not a number of cells.
empty = D354_Biop_Int2[qc_df_subsetEmpty.index.tolist()]
empty.var['n_cells'] = empty.X.sum(axis=0).A1
# Sum over all genes divided by the number of empty droplets, i.e. the mean total
# count per empty droplet.
back = sum(empty.var['n_cells'])/empty.shape[0] # Background score for the sample
#empty.var.sort_values(by = 'n_cells', ascending = False).iloc[0:50].index.tolist()
# Keep only the barcodes above the minCount threshold and attach the background
# statistics: one value per sample in obs, one value per gene in var.
D354_Biop_Int2 = D354_Biop_Int2[qc_df_subsetFull.index.tolist()]
D354_Biop_Int2.obs['background'] = back
D354_Biop_Int2.obs['empty_droplets'] = empty.shape[0]
D354_Biop_Int2.var['background_gene'] = empty.var['n_cells']
sc.pp.filter_cells(D354_Biop_Int2, min_genes=0)
# Fraction (0-1, not a percentage) of counts from genes whose symbol starts with 'MT-'.
mito_genes = D354_Biop_Int2.var_names.str.startswith('MT-')
D354_Biop_Int2.obs['percent_mito'] = np.sum(
D354_Biop_Int2[:, mito_genes].X, axis=1).A1 / np.sum(D354_Biop_Int2.X, axis=1).A1
D354_Biop_Int2.obs['n_counts'] = D354_Biop_Int2.X.sum(axis=1).A1
# Fraction of counts from the genes listed in the RB gene file. Within this section
# only the third return value of remove_RB_genes (the gene list) is used afterwards.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D354_Biop_Int2.to_df())
ribo_genes = D354_Biop_Int2.to_df().columns.isin(RB_genes_in_df)
D354_Biop_Int2.obs['percent_ribo'] = np.sum(
D354_Biop_Int2[:, ribo_genes].X, axis=1).A1 / np.sum(D354_Biop_Int2.X, axis=1).A1
# Despite its name, var['ribo_genes'] is True for genes that are NOT in the RB list.
D354_Biop_Int2.var['ribo_genes'] = [not i for i in ribo_genes]
sc.pl.violin(D354_Biop_Int2, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
jitter=0.4, multi_panel=True)
# QC cut-offs used for this sample: min_genes=500, n_counts < 20000,
# percent_mito < 0.2.
sc.pp.filter_cells(D354_Biop_Int2, min_genes=500)
D354_Biop_Int2 = D354_Biop_Int2[D354_Biop_Int2.obs['n_counts'] < 20000, :]
D354_Biop_Int2 = D354_Biop_Int2[D354_Biop_Int2.obs['percent_mito'] < 0.2, :]
sc.pp.normalize_per_cell(D354_Biop_Int2, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D354_Biop_Int2) # log transform the data
# Store the normalised, log-transformed object (all genes) in .raw and bind a second
# name to the same object (no copy is made) before the subset below rebinds the variable.
D354_Biop_Int2.raw = D354_Biop_Int2
D354_Biop_Int2_raw = D354_Biop_Int2 # freeze the object (for later use of the raw state of it)
# Keep only the genes flagged True in var['ribo_genes'], i.e. the non-RB genes.
D354_Biop_Int2 = D354_Biop_Int2[:, D354_Biop_Int2.var['ribo_genes']]
D354_Biop_Int2
# ---- Sample D363_Biop_Int2 (donor 'D363', method 'Biopsy', position 'Intermediate') ----
# Load the matrix found under outsPath (raw_gene_bc_matrices), i.e. including the
# barcodes that the filtered matrix does not contain.
D363_Biop_Int2 = sc.read_10x_mtx(
'./D363_Biop_Int2/' + outsPath,
var_names='gene_symbols',
cache=True)
D363_Biop_Int2.var_names_make_unique()
# Per-barcode sample metadata.
D363_Biop_Int2.obs['manip'] = 'D363_Biop_Int2'
D363_Biop_Int2.obs['position'] = 'Intermediate'
D363_Biop_Int2.obs['method'] = 'Biopsy'
D363_Biop_Int2.obs['donor'] = 'D363'
# Prefix every barcode with the sample name and use the result as obs_names.
D363_Biop_Int2.obs['name'] = ['D363_Biop_Int2_' + s for s in list(D363_Biop_Int2.obs.index)]
D363_Biop_Int2.obs_names = D363_Biop_Int2.obs['name']
D363_Biop_Int2
# Load the filtered matrix of the same sample; its smallest per-barcode total count
# becomes the threshold separating retained barcodes from background.
filtered = sc.read_10x_mtx(
'./D363_Biop_Int2/' + outsPathFilter,
var_names='gene_symbols',
cache=True)
minCount = min(filtered.X.sum(axis=1).A1)
# Total counts per barcode. filter_cells is called with min_genes=0; the 'n_genes'
# column read just below is presumably added by this call -- TODO confirm.
D363_Biop_Int2.obs['n_counts'] = D363_Biop_Int2.X.sum(axis=1).A1
sc.pp.filter_cells(D363_Biop_Int2, min_genes=0)
# NOTE(review): attribute access without a call; it has no effect on the data.
D363_Biop_Int2.obs_keys
# Barcode-rank table: barcodes sorted by decreasing total count and ranked from 0;
# zero-count barcodes are dropped after the ranks have been assigned.
qc_df = D363_Biop_Int2.obs[['n_genes','n_counts']]
qc_df = qc_df.sort_values(by = 'n_counts', ascending = False)
qc_df['cell_nb'] = range(0, qc_df.shape[0])
qc_df = qc_df[qc_df['n_counts'] > 0]
qc_df.shape
# Barcodes with strictly more counts than minCount are the ones kept further down.
qc_df_subsetFull = qc_df[qc_df['n_counts'] > minCount]
print(qc_df_subsetFull.shape)
# Barcodes with 5 < n_counts <= 50 are used as the empty-droplet (background) set.
qc_df_subsetEmpty = qc_df[qc_df['n_counts'] <= 50]
qc_df_subsetEmpty = qc_df_subsetEmpty[qc_df_subsetEmpty['n_counts'] > 5]
print(qc_df_subsetEmpty.shape)
# Barcode-rank plot (rank vs. total counts) on log-log axes.
fig, ax = plt.subplots()
ax.plot(qc_df[['cell_nb']], qc_df[['n_counts']])
ax.set_xscale('log'); ax.set_yscale('log')
# NOTE(review): the red line is drawn at y=10, whereas the empty-droplet window
# applied above is 5 < n_counts <= 50 -- confirm which value is intended.
plt.axhline(y=10, color='r') # Limit to empty droplets
plt.axhline(y=minCount, color='k') # Limit to filtered dataset
plt.show()
# Per-gene counts summed over the empty droplets. The column is called 'n_cells'
# but it holds summed counts, not a number of cells.
empty = D363_Biop_Int2[qc_df_subsetEmpty.index.tolist()]
empty.var['n_cells'] = empty.X.sum(axis=0).A1
# Sum over all genes divided by the number of empty droplets, i.e. the mean total
# count per empty droplet.
back = sum(empty.var['n_cells'])/empty.shape[0] # Background score for the sample
#empty.var.sort_values(by = 'n_cells', ascending = False).iloc[0:50].index.tolist()
# Keep only the barcodes above the minCount threshold and attach the background
# statistics: one value per sample in obs, one value per gene in var.
D363_Biop_Int2 = D363_Biop_Int2[qc_df_subsetFull.index.tolist()]
D363_Biop_Int2.obs['background'] = back
D363_Biop_Int2.obs['empty_droplets'] = empty.shape[0]
D363_Biop_Int2.var['background_gene'] = empty.var['n_cells']
sc.pp.filter_cells(D363_Biop_Int2, min_genes=0)
# Fraction (0-1, not a percentage) of counts from genes whose symbol starts with 'MT-'.
mito_genes = D363_Biop_Int2.var_names.str.startswith('MT-')
D363_Biop_Int2.obs['percent_mito'] = np.sum(
D363_Biop_Int2[:, mito_genes].X, axis=1).A1 / np.sum(D363_Biop_Int2.X, axis=1).A1
D363_Biop_Int2.obs['n_counts'] = D363_Biop_Int2.X.sum(axis=1).A1
# Fraction of counts from the genes listed in the RB gene file. Within this section
# only the third return value of remove_RB_genes (the gene list) is used afterwards.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D363_Biop_Int2.to_df())
ribo_genes = D363_Biop_Int2.to_df().columns.isin(RB_genes_in_df)
D363_Biop_Int2.obs['percent_ribo'] = np.sum(
D363_Biop_Int2[:, ribo_genes].X, axis=1).A1 / np.sum(D363_Biop_Int2.X, axis=1).A1
# Despite its name, var['ribo_genes'] is True for genes that are NOT in the RB list.
D363_Biop_Int2.var['ribo_genes'] = [not i for i in ribo_genes]
sc.pl.violin(D363_Biop_Int2, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
jitter=0.4, multi_panel=True)
# QC cut-offs used for this sample: min_genes=500, n_counts < 15000,
# percent_mito < 0.2.
sc.pp.filter_cells(D363_Biop_Int2, min_genes=500)
D363_Biop_Int2 = D363_Biop_Int2[D363_Biop_Int2.obs['n_counts'] < 15000, :]
D363_Biop_Int2 = D363_Biop_Int2[D363_Biop_Int2.obs['percent_mito'] < 0.2, :]
sc.pp.normalize_per_cell(D363_Biop_Int2, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D363_Biop_Int2) # log transform the data
# Store the normalised, log-transformed object (all genes) in .raw and bind a second
# name to the same object (no copy is made) before the subset below rebinds the variable.
D363_Biop_Int2.raw = D363_Biop_Int2
D363_Biop_Int2_raw = D363_Biop_Int2 # freeze the object (for later use of the raw state of it)
# Keep only the genes flagged True in var['ribo_genes'], i.e. the non-RB genes.
D363_Biop_Int2 = D363_Biop_Int2[:, D363_Biop_Int2.var['ribo_genes']]
D363_Biop_Int2
# ---- Sample D367_Biop_Int1 (donor 'D367', method 'Biopsy', position 'Intermediate') ----
# Load the matrix found under outsPath (raw_gene_bc_matrices), i.e. including the
# barcodes that the filtered matrix does not contain.
D367_Biop_Int1 = sc.read_10x_mtx(
'./D367_Biop_Int1/' + outsPath,
var_names='gene_symbols',
cache=True)
D367_Biop_Int1.var_names_make_unique()
# Per-barcode sample metadata.
D367_Biop_Int1.obs['manip'] = 'D367_Biop_Int1'
D367_Biop_Int1.obs['position'] = 'Intermediate'
D367_Biop_Int1.obs['method'] = 'Biopsy'
D367_Biop_Int1.obs['donor'] = 'D367'
# Prefix every barcode with the sample name and use the result as obs_names.
D367_Biop_Int1.obs['name'] = ['D367_Biop_Int1_' + s for s in list(D367_Biop_Int1.obs.index)]
D367_Biop_Int1.obs_names = D367_Biop_Int1.obs['name']
D367_Biop_Int1
# Load the filtered matrix of the same sample; its smallest per-barcode total count
# becomes the threshold separating retained barcodes from background.
filtered = sc.read_10x_mtx(
'./D367_Biop_Int1/' + outsPathFilter,
var_names='gene_symbols',
cache=True)
minCount = min(filtered.X.sum(axis=1).A1)
# Total counts per barcode. filter_cells is called with min_genes=0; the 'n_genes'
# column read just below is presumably added by this call -- TODO confirm.
D367_Biop_Int1.obs['n_counts'] = D367_Biop_Int1.X.sum(axis=1).A1
sc.pp.filter_cells(D367_Biop_Int1, min_genes=0)
# NOTE(review): attribute access without a call; it has no effect on the data.
D367_Biop_Int1.obs_keys
# Barcode-rank table: barcodes sorted by decreasing total count and ranked from 0;
# zero-count barcodes are dropped after the ranks have been assigned.
qc_df = D367_Biop_Int1.obs[['n_genes','n_counts']]
qc_df = qc_df.sort_values(by = 'n_counts', ascending = False)
qc_df['cell_nb'] = range(0, qc_df.shape[0])
qc_df = qc_df[qc_df['n_counts'] > 0]
qc_df.shape
# Barcodes with strictly more counts than minCount are the ones kept further down.
qc_df_subsetFull = qc_df[qc_df['n_counts'] > minCount]
print(qc_df_subsetFull.shape)
# Barcodes with 5 < n_counts <= 50 are used as the empty-droplet (background) set.
qc_df_subsetEmpty = qc_df[qc_df['n_counts'] <= 50]
qc_df_subsetEmpty = qc_df_subsetEmpty[qc_df_subsetEmpty['n_counts'] > 5]
print(qc_df_subsetEmpty.shape)
# Barcode-rank plot (rank vs. total counts) on log-log axes.
fig, ax = plt.subplots()
ax.plot(qc_df[['cell_nb']], qc_df[['n_counts']])
ax.set_xscale('log'); ax.set_yscale('log')
# NOTE(review): the red line is drawn at y=10, whereas the empty-droplet window
# applied above is 5 < n_counts <= 50 -- confirm which value is intended.
plt.axhline(y=10, color='r') # Limit to empty droplets
plt.axhline(y=minCount, color='k') # Limit to filtered dataset
plt.show()
# Per-gene counts summed over the empty droplets. The column is called 'n_cells'
# but it holds summed counts, not a number of cells.
empty = D367_Biop_Int1[qc_df_subsetEmpty.index.tolist()]
empty.var['n_cells'] = empty.X.sum(axis=0).A1
# Sum over all genes divided by the number of empty droplets, i.e. the mean total
# count per empty droplet.
back = sum(empty.var['n_cells'])/empty.shape[0] # Background score for the sample
#empty.var.sort_values(by = 'n_cells', ascending = False).iloc[0:50].index.tolist()
# Keep only the barcodes above the minCount threshold and attach the background
# statistics: one value per sample in obs, one value per gene in var.
D367_Biop_Int1 = D367_Biop_Int1[qc_df_subsetFull.index.tolist()]
D367_Biop_Int1.obs['background'] = back
D367_Biop_Int1.obs['empty_droplets'] = empty.shape[0]
D367_Biop_Int1.var['background_gene'] = empty.var['n_cells']
sc.pp.filter_cells(D367_Biop_Int1, min_genes=0)
# Fraction (0-1, not a percentage) of counts from genes whose symbol starts with 'MT-'.
mito_genes = D367_Biop_Int1.var_names.str.startswith('MT-')
D367_Biop_Int1.obs['percent_mito'] = np.sum(
D367_Biop_Int1[:, mito_genes].X, axis=1).A1 / np.sum(D367_Biop_Int1.X, axis=1).A1
D367_Biop_Int1.obs['n_counts'] = D367_Biop_Int1.X.sum(axis=1).A1
# Fraction of counts from the genes listed in the RB gene file. Within this section
# only the third return value of remove_RB_genes (the gene list) is used afterwards.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D367_Biop_Int1.to_df())
ribo_genes = D367_Biop_Int1.to_df().columns.isin(RB_genes_in_df)
D367_Biop_Int1.obs['percent_ribo'] = np.sum(
D367_Biop_Int1[:, ribo_genes].X, axis=1).A1 / np.sum(D367_Biop_Int1.X, axis=1).A1
# Despite its name, var['ribo_genes'] is True for genes that are NOT in the RB list.
D367_Biop_Int1.var['ribo_genes'] = [not i for i in ribo_genes]
sc.pl.violin(D367_Biop_Int1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
jitter=0.4, multi_panel=True)
# QC cut-offs used for this sample: min_genes=500, n_counts < 20000,
# percent_mito < 0.1.
sc.pp.filter_cells(D367_Biop_Int1, min_genes=500)
D367_Biop_Int1 = D367_Biop_Int1[D367_Biop_Int1.obs['n_counts'] < 20000, :]
D367_Biop_Int1 = D367_Biop_Int1[D367_Biop_Int1.obs['percent_mito'] < 0.1, :]
sc.pp.normalize_per_cell(D367_Biop_Int1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D367_Biop_Int1) # log transform the data
# Store the normalised, log-transformed object (all genes) in .raw and bind a second
# name to the same object (no copy is made) before the subset below rebinds the variable.
D367_Biop_Int1.raw = D367_Biop_Int1
D367_Biop_Int1_raw = D367_Biop_Int1 # freeze the object (for later use of the raw state of it)
# Keep only the genes flagged True in var['ribo_genes'], i.e. the non-RB genes.
D367_Biop_Int1 = D367_Biop_Int1[:, D367_Biop_Int1.var['ribo_genes']]
D367_Biop_Int1
# ---- Sample D372_Biop_Int1 (donor 'D372', method 'Biopsy', position 'Intermediate') ----
# Load the matrix found under outsPath (raw_gene_bc_matrices), i.e. including the
# barcodes that the filtered matrix does not contain.
D372_Biop_Int1 = sc.read_10x_mtx(
'./D372_Biop_Int1/' + outsPath,
var_names='gene_symbols',
cache=True)
D372_Biop_Int1.var_names_make_unique()
# Per-barcode sample metadata.
D372_Biop_Int1.obs['manip'] = 'D372_Biop_Int1'
D372_Biop_Int1.obs['position'] = 'Intermediate'
D372_Biop_Int1.obs['method'] = 'Biopsy'
D372_Biop_Int1.obs['donor'] = 'D372'
# Prefix every barcode with the sample name and use the result as obs_names.
D372_Biop_Int1.obs['name'] = ['D372_Biop_Int1_' + s for s in list(D372_Biop_Int1.obs.index)]
D372_Biop_Int1.obs_names = D372_Biop_Int1.obs['name']
D372_Biop_Int1
# Load the filtered matrix of the same sample; its smallest per-barcode total count
# becomes the threshold separating retained barcodes from background.
filtered = sc.read_10x_mtx(
'./D372_Biop_Int1/' + outsPathFilter,
var_names='gene_symbols',
cache=True)
minCount = min(filtered.X.sum(axis=1).A1)
# Total counts per barcode. filter_cells is called with min_genes=0; the 'n_genes'
# column read just below is presumably added by this call -- TODO confirm.
D372_Biop_Int1.obs['n_counts'] = D372_Biop_Int1.X.sum(axis=1).A1
sc.pp.filter_cells(D372_Biop_Int1, min_genes=0)
# NOTE(review): attribute access without a call; it has no effect on the data.
D372_Biop_Int1.obs_keys
# Barcode-rank table: barcodes sorted by decreasing total count and ranked from 0;
# zero-count barcodes are dropped after the ranks have been assigned.
qc_df = D372_Biop_Int1.obs[['n_genes','n_counts']]
qc_df = qc_df.sort_values(by = 'n_counts', ascending = False)
qc_df['cell_nb'] = range(0, qc_df.shape[0])
qc_df = qc_df[qc_df['n_counts'] > 0]
qc_df.shape
# Barcodes with strictly more counts than minCount are the ones kept further down.
qc_df_subsetFull = qc_df[qc_df['n_counts'] > minCount]
print(qc_df_subsetFull.shape)
# Barcodes with 5 < n_counts <= 50 are used as the empty-droplet (background) set.
qc_df_subsetEmpty = qc_df[qc_df['n_counts'] <= 50]
qc_df_subsetEmpty = qc_df_subsetEmpty[qc_df_subsetEmpty['n_counts'] > 5]
print(qc_df_subsetEmpty.shape)
# Barcode-rank plot (rank vs. total counts) on log-log axes.
fig, ax = plt.subplots()
ax.plot(qc_df[['cell_nb']], qc_df[['n_counts']])
ax.set_xscale('log'); ax.set_yscale('log')
# NOTE(review): the red line is drawn at y=10, whereas the empty-droplet window
# applied above is 5 < n_counts <= 50 -- confirm which value is intended.
plt.axhline(y=10, color='r') # Limit to empty droplets
plt.axhline(y=minCount, color='k') # Limit to filtered dataset
plt.show()
# Per-gene counts summed over the empty droplets. The column is called 'n_cells'
# but it holds summed counts, not a number of cells.
empty = D372_Biop_Int1[qc_df_subsetEmpty.index.tolist()]
empty.var['n_cells'] = empty.X.sum(axis=0).A1
# Sum over all genes divided by the number of empty droplets, i.e. the mean total
# count per empty droplet.
back = sum(empty.var['n_cells'])/empty.shape[0] # Background score for the sample
#empty.var.sort_values(by = 'n_cells', ascending = False).iloc[0:50].index.tolist()
# Keep only the barcodes above the minCount threshold and attach the background
# statistics: one value per sample in obs, one value per gene in var.
D372_Biop_Int1 = D372_Biop_Int1[qc_df_subsetFull.index.tolist()]
D372_Biop_Int1.obs['background'] = back
D372_Biop_Int1.obs['empty_droplets'] = empty.shape[0]
D372_Biop_Int1.var['background_gene'] = empty.var['n_cells']
sc.pp.filter_cells(D372_Biop_Int1, min_genes=0)
# Fraction (0-1, not a percentage) of counts from genes whose symbol starts with 'MT-'.
mito_genes = D372_Biop_Int1.var_names.str.startswith('MT-')
D372_Biop_Int1.obs['percent_mito'] = np.sum(
D372_Biop_Int1[:, mito_genes].X, axis=1).A1 / np.sum(D372_Biop_Int1.X, axis=1).A1
D372_Biop_Int1.obs['n_counts'] = D372_Biop_Int1.X.sum(axis=1).A1
# Fraction of counts from the genes listed in the RB gene file. Within this section
# only the third return value of remove_RB_genes (the gene list) is used afterwards.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D372_Biop_Int1.to_df())
ribo_genes = D372_Biop_Int1.to_df().columns.isin(RB_genes_in_df)
D372_Biop_Int1.obs['percent_ribo'] = np.sum(
D372_Biop_Int1[:, ribo_genes].X, axis=1).A1 / np.sum(D372_Biop_Int1.X, axis=1).A1
# Despite its name, var['ribo_genes'] is True for genes that are NOT in the RB list.
D372_Biop_Int1.var['ribo_genes'] = [not i for i in ribo_genes]
sc.pl.violin(D372_Biop_Int1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
jitter=0.4, multi_panel=True)
# QC cut-offs used for this sample: min_genes=500, n_counts < 20000,
# percent_mito < 0.2.
sc.pp.filter_cells(D372_Biop_Int1, min_genes=500)
D372_Biop_Int1 = D372_Biop_Int1[D372_Biop_Int1.obs['n_counts'] < 20000, :]
D372_Biop_Int1 = D372_Biop_Int1[D372_Biop_Int1.obs['percent_mito'] < 0.2, :]
sc.pp.normalize_per_cell(D372_Biop_Int1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D372_Biop_Int1) # log transform the data
# Store the normalised, log-transformed object (all genes) in .raw and bind a second
# name to the same object (no copy is made) before the subset below rebinds the variable.
D372_Biop_Int1.raw = D372_Biop_Int1
D372_Biop_Int1_raw = D372_Biop_Int1 # freeze the object (for later use of the raw state of it)
# Keep only the genes flagged True in var['ribo_genes'], i.e. the non-RB genes.
D372_Biop_Int1 = D372_Biop_Int1[:, D372_Biop_Int1.var['ribo_genes']]
D372_Biop_Int1
# ---- Sample D372_Biop_Int2 (donor 'D372', method 'Biopsy', position 'Intermediate') ----
# Load the matrix found under outsPath (raw_gene_bc_matrices), i.e. including the
# barcodes that the filtered matrix does not contain.
D372_Biop_Int2 = sc.read_10x_mtx(
'./D372_Biop_Int2/' + outsPath,
var_names='gene_symbols',
cache=True)
D372_Biop_Int2.var_names_make_unique()
# Per-barcode sample metadata.
D372_Biop_Int2.obs['manip'] = 'D372_Biop_Int2'
D372_Biop_Int2.obs['position'] = 'Intermediate'
D372_Biop_Int2.obs['method'] = 'Biopsy'
D372_Biop_Int2.obs['donor'] = 'D372'
# Prefix every barcode with the sample name and use the result as obs_names.
D372_Biop_Int2.obs['name'] = ['D372_Biop_Int2_' + s for s in list(D372_Biop_Int2.obs.index)]
D372_Biop_Int2.obs_names = D372_Biop_Int2.obs['name']
D372_Biop_Int2
# Load the filtered matrix of the same sample; its smallest per-barcode total count
# becomes the threshold separating retained barcodes from background.
filtered = sc.read_10x_mtx(
'./D372_Biop_Int2/' + outsPathFilter,
var_names='gene_symbols',
cache=True)
minCount = min(filtered.X.sum(axis=1).A1)
# Total counts per barcode. filter_cells is called with min_genes=0; the 'n_genes'
# column read just below is presumably added by this call -- TODO confirm.
D372_Biop_Int2.obs['n_counts'] = D372_Biop_Int2.X.sum(axis=1).A1
sc.pp.filter_cells(D372_Biop_Int2, min_genes=0)
# NOTE(review): attribute access without a call; it has no effect on the data.
D372_Biop_Int2.obs_keys
# Barcode-rank table: barcodes sorted by decreasing total count and ranked from 0;
# zero-count barcodes are dropped after the ranks have been assigned.
qc_df = D372_Biop_Int2.obs[['n_genes','n_counts']]
qc_df = qc_df.sort_values(by = 'n_counts', ascending = False)
qc_df['cell_nb'] = range(0, qc_df.shape[0])
qc_df = qc_df[qc_df['n_counts'] > 0]
qc_df.shape
# Barcodes with strictly more counts than minCount are the ones kept further down.
qc_df_subsetFull = qc_df[qc_df['n_counts'] > minCount]
print(qc_df_subsetFull.shape)
# Barcodes with 5 < n_counts <= 50 are used as the empty-droplet (background) set.
qc_df_subsetEmpty = qc_df[qc_df['n_counts'] <= 50]
qc_df_subsetEmpty = qc_df_subsetEmpty[qc_df_subsetEmpty['n_counts'] > 5]
print(qc_df_subsetEmpty.shape)
# Barcode-rank plot (rank vs. total counts) on log-log axes.
fig, ax = plt.subplots()
ax.plot(qc_df[['cell_nb']], qc_df[['n_counts']])
ax.set_xscale('log'); ax.set_yscale('log')
# NOTE(review): the red line is drawn at y=10, whereas the empty-droplet window
# applied above is 5 < n_counts <= 50 -- confirm which value is intended.
plt.axhline(y=10, color='r') # Limit to empty droplets
plt.axhline(y=minCount, color='k') # Limit to filtered dataset
plt.show()
# Per-gene counts summed over the empty droplets. The column is called 'n_cells'
# but it holds summed counts, not a number of cells.
empty = D372_Biop_Int2[qc_df_subsetEmpty.index.tolist()]
empty.var['n_cells'] = empty.X.sum(axis=0).A1
# Sum over all genes divided by the number of empty droplets, i.e. the mean total
# count per empty droplet.
back = sum(empty.var['n_cells'])/empty.shape[0] # Background score for the sample
#empty.var.sort_values(by = 'n_cells', ascending = False).iloc[0:50].index.tolist()
# Keep only the barcodes above the minCount threshold and attach the background
# statistics: one value per sample in obs, one value per gene in var.
D372_Biop_Int2 = D372_Biop_Int2[qc_df_subsetFull.index.tolist()]
D372_Biop_Int2.obs['background'] = back
D372_Biop_Int2.obs['empty_droplets'] = empty.shape[0]
D372_Biop_Int2.var['background_gene'] = empty.var['n_cells']
sc.pp.filter_cells(D372_Biop_Int2, min_genes=0)
# Fraction (0-1, not a percentage) of counts from genes whose symbol starts with 'MT-'.
mito_genes = D372_Biop_Int2.var_names.str.startswith('MT-')
D372_Biop_Int2.obs['percent_mito'] = np.sum(
D372_Biop_Int2[:, mito_genes].X, axis=1).A1 / np.sum(D372_Biop_Int2.X, axis=1).A1
D372_Biop_Int2.obs['n_counts'] = D372_Biop_Int2.X.sum(axis=1).A1
# Fraction of counts from the genes listed in the RB gene file. Within this section
# only the third return value of remove_RB_genes (the gene list) is used afterwards.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D372_Biop_Int2.to_df())
ribo_genes = D372_Biop_Int2.to_df().columns.isin(RB_genes_in_df)
D372_Biop_Int2.obs['percent_ribo'] = np.sum(
D372_Biop_Int2[:, ribo_genes].X, axis=1).A1 / np.sum(D372_Biop_Int2.X, axis=1).A1
# Despite its name, var['ribo_genes'] is True for genes that are NOT in the RB list.
D372_Biop_Int2.var['ribo_genes'] = [not i for i in ribo_genes]
sc.pl.violin(D372_Biop_Int2, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
jitter=0.4, multi_panel=True)
# QC cut-offs used for this sample: min_genes=500, n_counts < 20000,
# percent_mito < 0.2.
sc.pp.filter_cells(D372_Biop_Int2, min_genes=500)
D372_Biop_Int2 = D372_Biop_Int2[D372_Biop_Int2.obs['n_counts'] < 20000, :]
D372_Biop_Int2 = D372_Biop_Int2[D372_Biop_Int2.obs['percent_mito'] < 0.2, :]
sc.pp.normalize_per_cell(D372_Biop_Int2, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D372_Biop_Int2) # log transform the data
# Store the normalised, log-transformed object (all genes) in .raw and bind a second
# name to the same object (no copy is made) before the subset below rebinds the variable.
D372_Biop_Int2.raw = D372_Biop_Int2
D372_Biop_Int2_raw = D372_Biop_Int2 # freeze the object (for later use of the raw state of it)
# Keep only the genes flagged True in var['ribo_genes'], i.e. the non-RB genes.
D372_Biop_Int2 = D372_Biop_Int2[:, D372_Biop_Int2.var['ribo_genes']]
D372_Biop_Int2
# ---- Sample D326_Brus_Dis1 (donor 'D326', method 'Brushing', position 'Distal') ----
# Load the matrix found under outsPath (raw_gene_bc_matrices), i.e. including the
# barcodes that the filtered matrix does not contain.
D326_Brus_Dis1 = sc.read_10x_mtx(
'./D326_Brus_Dis1/' + outsPath,
var_names='gene_symbols',
cache=True)
D326_Brus_Dis1.var_names_make_unique()
# Per-barcode sample metadata.
D326_Brus_Dis1.obs['manip'] = 'D326_Brus_Dis1'
D326_Brus_Dis1.obs['position'] = 'Distal'
D326_Brus_Dis1.obs['method'] = 'Brushing'
D326_Brus_Dis1.obs['donor'] = 'D326'
# Prefix every barcode with the sample name and use the result as obs_names.
D326_Brus_Dis1.obs['name'] = ['D326_Brus_Dis1_' + s for s in list(D326_Brus_Dis1.obs.index)]
D326_Brus_Dis1.obs_names = D326_Brus_Dis1.obs['name']
D326_Brus_Dis1
# Load the filtered matrix of the same sample; its smallest per-barcode total count
# becomes the threshold separating retained barcodes from background.
filtered = sc.read_10x_mtx(
'./D326_Brus_Dis1/' + outsPathFilter,
var_names='gene_symbols',
cache=True)
minCount = min(filtered.X.sum(axis=1).A1)
# Total counts per barcode. filter_cells is called with min_genes=0; the 'n_genes'
# column read just below is presumably added by this call -- TODO confirm.
D326_Brus_Dis1.obs['n_counts'] = D326_Brus_Dis1.X.sum(axis=1).A1
sc.pp.filter_cells(D326_Brus_Dis1, min_genes=0)
# NOTE(review): attribute access without a call; it has no effect on the data.
D326_Brus_Dis1.obs_keys
# Barcode-rank table: barcodes sorted by decreasing total count and ranked from 0;
# zero-count barcodes are dropped after the ranks have been assigned.
qc_df = D326_Brus_Dis1.obs[['n_genes','n_counts']]
qc_df = qc_df.sort_values(by = 'n_counts', ascending = False)
qc_df['cell_nb'] = range(0, qc_df.shape[0])
qc_df = qc_df[qc_df['n_counts'] > 0]
qc_df.shape
# Barcodes with strictly more counts than minCount are the ones kept further down.
qc_df_subsetFull = qc_df[qc_df['n_counts'] > minCount]
print(qc_df_subsetFull.shape)
# Barcodes with 5 < n_counts <= 50 are used as the empty-droplet (background) set.
qc_df_subsetEmpty = qc_df[qc_df['n_counts'] <= 50]
qc_df_subsetEmpty = qc_df_subsetEmpty[qc_df_subsetEmpty['n_counts'] > 5]
print(qc_df_subsetEmpty.shape)
# Barcode-rank plot (rank vs. total counts) on log-log axes.
fig, ax = plt.subplots()
ax.plot(qc_df[['cell_nb']], qc_df[['n_counts']])
ax.set_xscale('log'); ax.set_yscale('log')
# NOTE(review): the red line is drawn at y=10, whereas the empty-droplet window
# applied above is 5 < n_counts <= 50 -- confirm which value is intended.
plt.axhline(y=10, color='r') # Limit to empty droplets
plt.axhline(y=minCount, color='k') # Limit to filtered dataset
plt.show()
# Per-gene counts summed over the empty droplets. The column is called 'n_cells'
# but it holds summed counts, not a number of cells.
empty = D326_Brus_Dis1[qc_df_subsetEmpty.index.tolist()]
empty.var['n_cells'] = empty.X.sum(axis=0).A1
# Sum over all genes divided by the number of empty droplets, i.e. the mean total
# count per empty droplet.
back = sum(empty.var['n_cells'])/empty.shape[0] # Background score for the sample
#empty.var.sort_values(by = 'n_cells', ascending = False).iloc[0:50].index.tolist()
# Keep only the barcodes above the minCount threshold and attach the background
# statistics: one value per sample in obs, one value per gene in var.
D326_Brus_Dis1 = D326_Brus_Dis1[qc_df_subsetFull.index.tolist()]
D326_Brus_Dis1.obs['background'] = back
D326_Brus_Dis1.obs['empty_droplets'] = empty.shape[0]
D326_Brus_Dis1.var['background_gene'] = empty.var['n_cells']
sc.pp.filter_cells(D326_Brus_Dis1, min_genes=0)
# Fraction (0-1, not a percentage) of counts from genes whose symbol starts with 'MT-'.
mito_genes = D326_Brus_Dis1.var_names.str.startswith('MT-')
D326_Brus_Dis1.obs['percent_mito'] = np.sum(
D326_Brus_Dis1[:, mito_genes].X, axis=1).A1 / np.sum(D326_Brus_Dis1.X, axis=1).A1
D326_Brus_Dis1.obs['n_counts'] = D326_Brus_Dis1.X.sum(axis=1).A1
# Fraction of counts from the genes listed in the RB gene file. Within this section
# only the third return value of remove_RB_genes (the gene list) is used afterwards.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D326_Brus_Dis1.to_df())
ribo_genes = D326_Brus_Dis1.to_df().columns.isin(RB_genes_in_df)
D326_Brus_Dis1.obs['percent_ribo'] = np.sum(
D326_Brus_Dis1[:, ribo_genes].X, axis=1).A1 / np.sum(D326_Brus_Dis1.X, axis=1).A1
# Despite its name, var['ribo_genes'] is True for genes that are NOT in the RB list.
D326_Brus_Dis1.var['ribo_genes'] = [not i for i in ribo_genes]
sc.pl.violin(D326_Brus_Dis1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
jitter=0.4, multi_panel=True)
# QC cut-offs used for this sample: min_genes=500, n_counts < 15000,
# percent_mito < 0.25.
sc.pp.filter_cells(D326_Brus_Dis1, min_genes=500)
D326_Brus_Dis1 = D326_Brus_Dis1[D326_Brus_Dis1.obs['n_counts'] < 15000, :]
D326_Brus_Dis1 = D326_Brus_Dis1[D326_Brus_Dis1.obs['percent_mito'] < 0.25 , :]
sc.pp.normalize_per_cell(D326_Brus_Dis1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D326_Brus_Dis1) # log transform the data
# Store the normalised, log-transformed object (all genes) in .raw and bind a second
# name to the same object (no copy is made) before the subset below rebinds the variable.
D326_Brus_Dis1.raw = D326_Brus_Dis1
D326_Brus_Dis1_raw = D326_Brus_Dis1 # freeze the object (for later use of the raw state of it)
# Keep only the genes flagged True in var['ribo_genes'], i.e. the non-RB genes.
D326_Brus_Dis1 = D326_Brus_Dis1[:, D326_Brus_Dis1.var['ribo_genes']]
D326_Brus_Dis1
# ---- Sample D337_Brus_Dis1 (donor 'D337', method 'Brushing', position 'Distal') ----
# Load the matrix found under outsPath (raw_gene_bc_matrices), i.e. including the
# barcodes that the filtered matrix does not contain.
D337_Brus_Dis1 = sc.read_10x_mtx(
'./D337_Brus_Dis1/' + outsPath,
var_names='gene_symbols',
cache=True)
D337_Brus_Dis1.var_names_make_unique()
# Per-barcode sample metadata.
D337_Brus_Dis1.obs['manip'] = 'D337_Brus_Dis1'
D337_Brus_Dis1.obs['position'] = 'Distal'
D337_Brus_Dis1.obs['method'] = 'Brushing'
D337_Brus_Dis1.obs['donor'] = 'D337'
# Prefix every barcode with the sample name and use the result as obs_names.
# The '_' separator between sample name and barcode is required so that the cell
# names follow the same '<sample>_<barcode>' pattern as the other samples.
# NOTE(review): any file keyed on the previous, separator-less names must be regenerated.
D337_Brus_Dis1.obs['name'] = ['D337_Brus_Dis1_' + s for s in list(D337_Brus_Dis1.obs.index)]
D337_Brus_Dis1.obs_names = D337_Brus_Dis1.obs['name']
D337_Brus_Dis1
# Load the filtered matrix of the same sample; its smallest per-barcode total count
# becomes the threshold separating retained barcodes from background.
filtered = sc.read_10x_mtx(
'./D337_Brus_Dis1/' + outsPathFilter,
var_names='gene_symbols',
cache=True)
minCount = min(filtered.X.sum(axis=1).A1)
# Total counts per barcode. filter_cells is called with min_genes=0; the 'n_genes'
# column read just below is presumably added by this call -- TODO confirm.
D337_Brus_Dis1.obs['n_counts'] = D337_Brus_Dis1.X.sum(axis=1).A1
sc.pp.filter_cells(D337_Brus_Dis1, min_genes=0)
# NOTE(review): attribute access without a call; it has no effect on the data.
D337_Brus_Dis1.obs_keys
# Barcode-rank table: barcodes sorted by decreasing total count and ranked from 0;
# zero-count barcodes are dropped after the ranks have been assigned.
qc_df = D337_Brus_Dis1.obs[['n_genes','n_counts']]
qc_df = qc_df.sort_values(by = 'n_counts', ascending = False)
qc_df['cell_nb'] = range(0, qc_df.shape[0])
qc_df = qc_df[qc_df['n_counts'] > 0]
qc_df.shape
# Barcodes with strictly more counts than minCount are the ones kept further down.
qc_df_subsetFull = qc_df[qc_df['n_counts'] > minCount]
print(qc_df_subsetFull.shape)
# Barcodes with 5 < n_counts <= 50 are used as the empty-droplet (background) set.
qc_df_subsetEmpty = qc_df[qc_df['n_counts'] <= 50]
qc_df_subsetEmpty = qc_df_subsetEmpty[qc_df_subsetEmpty['n_counts'] > 5]
print(qc_df_subsetEmpty.shape)
# Barcode-rank plot (rank vs. total counts) on log-log axes.
fig, ax = plt.subplots()
ax.plot(qc_df[['cell_nb']], qc_df[['n_counts']])
ax.set_xscale('log'); ax.set_yscale('log')
# NOTE(review): the red line is drawn at y=10, whereas the empty-droplet window
# applied above is 5 < n_counts <= 50 -- confirm which value is intended.
plt.axhline(y=10, color='r') # Limit to empty droplets
plt.axhline(y=minCount, color='k') # Limit to filtered dataset
plt.show()
# Per-gene counts summed over the empty droplets. The column is called 'n_cells'
# but it holds summed counts, not a number of cells.
empty = D337_Brus_Dis1[qc_df_subsetEmpty.index.tolist()]
empty.var['n_cells'] = empty.X.sum(axis=0).A1
# Sum over all genes divided by the number of empty droplets, i.e. the mean total
# count per empty droplet.
back = sum(empty.var['n_cells'])/empty.shape[0] # Background score for the sample
#empty.var.sort_values(by = 'n_cells', ascending = False).iloc[0:50].index.tolist()
# Keep only the barcodes above the minCount threshold and attach the background
# statistics: one value per sample in obs, one value per gene in var.
D337_Brus_Dis1 = D337_Brus_Dis1[qc_df_subsetFull.index.tolist()]
D337_Brus_Dis1.obs['background'] = back
D337_Brus_Dis1.obs['empty_droplets'] = empty.shape[0]
D337_Brus_Dis1.var['background_gene'] = empty.var['n_cells']
sc.pp.filter_cells(D337_Brus_Dis1, min_genes=0)
# Fraction (0-1, not a percentage) of counts from genes whose symbol starts with 'MT-'.
mito_genes = D337_Brus_Dis1.var_names.str.startswith('MT-')
D337_Brus_Dis1.obs['percent_mito'] = np.sum(
D337_Brus_Dis1[:, mito_genes].X, axis=1).A1 / np.sum(D337_Brus_Dis1.X, axis=1).A1
D337_Brus_Dis1.obs['n_counts'] = D337_Brus_Dis1.X.sum(axis=1).A1
# Fraction of counts from the genes listed in the RB gene file. Within this section
# only the third return value of remove_RB_genes (the gene list) is used afterwards.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D337_Brus_Dis1.to_df())
# The gene labels are read from var_names directly; building a second DataFrame with
# to_df() only to look at its column labels is unnecessary.
ribo_genes = D337_Brus_Dis1.var_names.isin(RB_genes_in_df)
D337_Brus_Dis1.obs['percent_ribo'] = np.sum(
D337_Brus_Dis1[:, ribo_genes].X, axis=1).A1 / np.sum(D337_Brus_Dis1.X, axis=1).A1
# Despite its name, var['ribo_genes'] is True for genes that are NOT in the RB list.
D337_Brus_Dis1.var['ribo_genes'] = [not i for i in ribo_genes]
sc.pl.violin(D337_Brus_Dis1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
jitter=0.4, multi_panel=True)
# QC cut-offs used for this sample: min_genes=500, n_counts < 30000,
# percent_mito < 0.5.
sc.pp.filter_cells(D337_Brus_Dis1, min_genes=500)
D337_Brus_Dis1 = D337_Brus_Dis1[D337_Brus_Dis1.obs['n_counts'] < 30000, :]
D337_Brus_Dis1 = D337_Brus_Dis1[D337_Brus_Dis1.obs['percent_mito'] < 0.5 , :]
sc.pp.normalize_per_cell(D337_Brus_Dis1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D337_Brus_Dis1) # log transform the data
# Store the normalised, log-transformed object (all genes) in .raw and bind a second
# name to the same object (no copy is made) before the subset below rebinds the variable.
D337_Brus_Dis1.raw = D337_Brus_Dis1
D337_Brus_Dis1_raw = D337_Brus_Dis1 # freeze the object (for later use of the raw state of it)
# Keep only the genes flagged True in var['ribo_genes'], i.e. the non-RB genes.
D337_Brus_Dis1 = D337_Brus_Dis1[:, D337_Brus_Dis1.var['ribo_genes']]
D337_Brus_Dis1
D339_Brus_Dis1 = sc.read_10x_mtx(
'./D339_Brus_Dis1/' + outsPath,
var_names='gene_symbols',
cache=True)
D339_Brus_Dis1.var_names_make_unique()
D339_Brus_Dis1.obs['manip'] = 'D339_Brus_Dis1'
D339_Brus_Dis1.obs['position'] = 'Distal'
D339_Brus_Dis1.obs['method'] = 'Brushing'
D339_Brus_Dis1.obs['donor'] = 'D339'
D339_Brus_Dis1.obs['name'] = ['D339_Brus_Dis1_' + s for s in list(D339_Brus_Dis1.obs.index)]
D339_Brus_Dis1.obs_names = D339_Brus_Dis1.obs['name']
D339_Brus_Dis1
filtered = sc.read_10x_mtx(
'./D339_Brus_Dis1/' + outsPathFilter,
var_names='gene_symbols',
cache=True)
minCount = min(filtered.X.sum(axis=1).A1)
D339_Brus_Dis1.obs['n_counts'] = D339_Brus_Dis1.X.sum(axis=1).A1
sc.pp.filter_cells(D339_Brus_Dis1, min_genes=0)
D339_Brus_Dis1.obs_keys
qc_df = D339_Brus_Dis1.obs[['n_genes','n_counts']]
qc_df = qc_df.sort_values(by = 'n_counts', ascending = False)
qc_df['cell_nb'] = range(0, qc_df.shape[0])
qc_df = qc_df[qc_df['n_counts'] > 0]
qc_df.shape
qc_df_subsetFull = qc_df[qc_df['n_counts'] > minCount]
print(qc_df_subsetFull.shape)
qc_df_subsetEmpty = qc_df[qc_df['n_counts'] <= 50]
qc_df_subsetEmpty = qc_df_subsetEmpty[qc_df_subsetEmpty['n_counts'] > 5]
print(qc_df_subsetEmpty.shape)
fig, ax = plt.subplots()
ax.plot(qc_df[['cell_nb']], qc_df[['n_counts']])
ax.set_xscale('log'); ax.set_yscale('log')
plt.axhline(y=10, color='r') # Limit to empty droplets
plt.axhline(y=minCount, color='k') # Limit to filtered dataset
plt.show()
empty = D339_Brus_Dis1[qc_df_subsetEmpty.index.tolist()]
empty.var['n_cells'] = empty.X.sum(axis=0).A1
back = sum(empty.var['n_cells'])/empty.shape[0] # Background score for the sample
#empty.var.sort_values(by = 'n_cells', ascending = False).iloc[0:50].index.tolist()
D339_Brus_Dis1 = D339_Brus_Dis1[qc_df_subsetFull.index.tolist()]
D339_Brus_Dis1.obs['background'] = back
D339_Brus_Dis1.obs['empty_droplets'] = empty.shape[0]
D339_Brus_Dis1.var['background_gene'] = empty.var['n_cells']
sc.pp.filter_cells(D339_Brus_Dis1, min_genes=0)
mito_genes = D339_Brus_Dis1.var_names.str.startswith('MT-')
D339_Brus_Dis1.obs['percent_mito'] = np.sum(
D339_Brus_Dis1[:, mito_genes].X, axis=1).A1 / np.sum(D339_Brus_Dis1.X, axis=1).A1
D339_Brus_Dis1.obs['n_counts'] = D339_Brus_Dis1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D339_Brus_Dis1.to_df())
ribo_genes = D339_Brus_Dis1.to_df().columns.isin(RB_genes_in_df)
D339_Brus_Dis1.obs['percent_ribo'] = np.sum(
D339_Brus_Dis1[:, ribo_genes].X, axis=1).A1 / np.sum(D339_Brus_Dis1.X, axis=1).A1
D339_Brus_Dis1.var['ribo_genes'] = [not i for i in ribo_genes]
sc.pl.violin(D339_Brus_Dis1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
jitter=0.4, multi_panel=True)
sc.pp.filter_cells(D339_Brus_Dis1, min_genes=500)
D339_Brus_Dis1 = D339_Brus_Dis1[D339_Brus_Dis1.obs['n_counts'] < 15000, :]
D339_Brus_Dis1 = D339_Brus_Dis1[D339_Brus_Dis1.obs['percent_mito'] < 0.5 , :]
sc.pp.normalize_per_cell(D339_Brus_Dis1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D339_Brus_Dis1) # log transform the data
D339_Brus_Dis1.raw = D339_Brus_Dis1
D339_Brus_Dis1_raw = D339_Brus_Dis1 # freeze the object (for later use of the raw state of it)
D339_Brus_Dis1 = D339_Brus_Dis1[:, D339_Brus_Dis1.var['ribo_genes']]
D339_Brus_Dis1
D344_Brus_Dis1 = sc.read_10x_mtx(
'./D344_Brus_Dis1/' + outsPath,
var_names='gene_symbols',
cache=True)
D344_Brus_Dis1.var_names_make_unique()
D344_Brus_Dis1.obs['manip'] = 'D344_Brus_Dis1'
D344_Brus_Dis1.obs['position'] = 'Distal'
D344_Brus_Dis1.obs['method'] = 'Brushing'
D344_Brus_Dis1.obs['donor'] = 'D344'
D344_Brus_Dis1.obs['name'] = ['D344_Brus_Dis1_' + s for s in list(D344_Brus_Dis1.obs.index)]
D344_Brus_Dis1.obs_names = D344_Brus_Dis1.obs['name']
D344_Brus_Dis1
filtered = sc.read_10x_mtx(
'./D344_Brus_Dis1/' + outsPathFilter,
var_names='gene_symbols',
cache=True)
minCount = min(filtered.X.sum(axis=1).A1)
D344_Brus_Dis1.obs['n_counts'] = D344_Brus_Dis1.X.sum(axis=1).A1
sc.pp.filter_cells(D344_Brus_Dis1, min_genes=0)
D344_Brus_Dis1.obs_keys
qc_df = D344_Brus_Dis1.obs[['n_genes','n_counts']]
qc_df = qc_df.sort_values(by = 'n_counts', ascending = False)
qc_df['cell_nb'] = range(0, qc_df.shape[0])
qc_df = qc_df[qc_df['n_counts'] > 0]
qc_df.shape
qc_df_subsetFull = qc_df[qc_df['n_counts'] > minCount]
print(qc_df_subsetFull.shape)
qc_df_subsetEmpty = qc_df[qc_df['n_counts'] <= 50]
qc_df_subsetEmpty = qc_df_subsetEmpty[qc_df_subsetEmpty['n_counts'] > 5]
print(qc_df_subsetEmpty.shape)
fig, ax = plt.subplots()
ax.plot(qc_df[['cell_nb']], qc_df[['n_counts']])
ax.set_xscale('log'); ax.set_yscale('log')
plt.axhline(y=10, color='r') # Limit to empty droplets
plt.axhline(y=minCount, color='k') # Limit to filtered dataset
plt.show()
empty = D344_Brus_Dis1[qc_df_subsetEmpty.index.tolist()]
empty.var['n_cells'] = empty.X.sum(axis=0).A1
back = sum(empty.var['n_cells'])/empty.shape[0] # Background score for the sample
#empty.var.sort_values(by = 'n_cells', ascending = False).iloc[0:50].index.tolist()
D344_Brus_Dis1 = D344_Brus_Dis1[qc_df_subsetFull.index.tolist()]
D344_Brus_Dis1.obs['background'] = back
D344_Brus_Dis1.obs['empty_droplets'] = empty.shape[0]
D344_Brus_Dis1.var['background_gene'] = empty.var['n_cells']
sc.pp.filter_cells(D344_Brus_Dis1, min_genes=0)
mito_genes = D344_Brus_Dis1.var_names.str.startswith('MT-')
D344_Brus_Dis1.obs['percent_mito'] = np.sum(
D344_Brus_Dis1[:, mito_genes].X, axis=1).A1 / np.sum(D344_Brus_Dis1.X, axis=1).A1
D344_Brus_Dis1.obs['n_counts'] = D344_Brus_Dis1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D344_Brus_Dis1.to_df())
ribo_genes = D344_Brus_Dis1.to_df().columns.isin(RB_genes_in_df)
D344_Brus_Dis1.obs['percent_ribo'] = np.sum(
D344_Brus_Dis1[:, ribo_genes].X, axis=1).A1 / np.sum(D344_Brus_Dis1.X, axis=1).A1
D344_Brus_Dis1.var['ribo_genes'] = [not i for i in ribo_genes]
sc.pl.violin(D344_Brus_Dis1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
jitter=0.4, multi_panel=True)
sc.pp.filter_cells(D344_Brus_Dis1, min_genes=500)
D344_Brus_Dis1 = D344_Brus_Dis1[D344_Brus_Dis1.obs['n_counts'] < 30000, :]
D344_Brus_Dis1 = D344_Brus_Dis1[D344_Brus_Dis1.obs['percent_mito'] < 0.3 , :]
sc.pp.normalize_per_cell(D344_Brus_Dis1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D344_Brus_Dis1) # log transform the data
D344_Brus_Dis1.raw = D344_Brus_Dis1
D344_Brus_Dis1_raw = D344_Brus_Dis1 # freeze the object (for later use of the raw state of it)
D344_Brus_Dis1 = D344_Brus_Dis1[:, D344_Brus_Dis1.var['ribo_genes']]
D344_Brus_Dis1
D353_Brus_Dis1 = sc.read_10x_mtx(
'./D353_Brus_Dis1/' + outsPath,
var_names='gene_symbols',
cache=True)
D353_Brus_Dis1.var_names_make_unique()
D353_Brus_Dis1.obs['manip'] = 'D353_Brus_Dis1'
D353_Brus_Dis1.obs['position'] = 'Distal'
D353_Brus_Dis1.obs['method'] = 'Brushing'
D353_Brus_Dis1.obs['donor'] = 'D353'
D353_Brus_Dis1.obs['name'] = ['D353_Brus_Dis1_' + s for s in list(D353_Brus_Dis1.obs.index)]
D353_Brus_Dis1.obs_names = D353_Brus_Dis1.obs['name']
D353_Brus_Dis1
filtered = sc.read_10x_mtx(
'./D353_Brus_Dis1/' + outsPathFilter,
var_names='gene_symbols',
cache=True)
minCount = min(filtered.X.sum(axis=1).A1)
D353_Brus_Dis1.obs['n_counts'] = D353_Brus_Dis1.X.sum(axis=1).A1
sc.pp.filter_cells(D353_Brus_Dis1, min_genes=0)
D353_Brus_Dis1.obs_keys
qc_df = D353_Brus_Dis1.obs[['n_genes','n_counts']]
qc_df = qc_df.sort_values(by = 'n_counts', ascending = False)
qc_df['cell_nb'] = range(0, qc_df.shape[0])
qc_df = qc_df[qc_df['n_counts'] > 0]
qc_df.shape
qc_df_subsetFull = qc_df[qc_df['n_counts'] > minCount]
print(qc_df_subsetFull.shape)
qc_df_subsetEmpty = qc_df[qc_df['n_counts'] <= 50]
qc_df_subsetEmpty = qc_df_subsetEmpty[qc_df_subsetEmpty['n_counts'] > 5]
print(qc_df_subsetEmpty.shape)
fig, ax = plt.subplots()
ax.plot(qc_df[['cell_nb']], qc_df[['n_counts']])
ax.set_xscale('log'); ax.set_yscale('log')
plt.axhline(y=10, color='r') # Limit to empty droplets
plt.axhline(y=minCount, color='k') # Limit to filtered dataset
plt.show()
empty = D353_Brus_Dis1[qc_df_subsetEmpty.index.tolist()]
empty.var['n_cells'] = empty.X.sum(axis=0).A1
back = sum(empty.var['n_cells'])/empty.shape[0] # Background score for the sample
#empty.var.sort_values(by = 'n_cells', ascending = False).iloc[0:50].index.tolist()
D353_Brus_Dis1 = D353_Brus_Dis1[qc_df_subsetFull.index.tolist()]
D353_Brus_Dis1.obs['background'] = back
D353_Brus_Dis1.obs['empty_droplets'] = empty.shape[0]
D353_Brus_Dis1.var['background_gene'] = empty.var['n_cells']
sc.pp.filter_cells(D353_Brus_Dis1, min_genes=0)
mito_genes = D353_Brus_Dis1.var_names.str.startswith('MT-')
D353_Brus_Dis1.obs['percent_mito'] = np.sum(
D353_Brus_Dis1[:, mito_genes].X, axis=1).A1 / np.sum(D353_Brus_Dis1.X, axis=1).A1
D353_Brus_Dis1.obs['n_counts'] = D353_Brus_Dis1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D353_Brus_Dis1.to_df())
ribo_genes = D353_Brus_Dis1.to_df().columns.isin(RB_genes_in_df)
D353_Brus_Dis1.obs['percent_ribo'] = np.sum(
D353_Brus_Dis1[:, ribo_genes].X, axis=1).A1 / np.sum(D353_Brus_Dis1.X, axis=1).A1
D353_Brus_Dis1.var['ribo_genes'] = [not i for i in ribo_genes]
sc.pl.violin(D353_Brus_Dis1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
jitter=0.4, multi_panel=True)
sc.pp.filter_cells(D353_Brus_Dis1, min_genes=500)
D353_Brus_Dis1 = D353_Brus_Dis1[D353_Brus_Dis1.obs['n_counts'] < 20000, :]
D353_Brus_Dis1 = D353_Brus_Dis1[D353_Brus_Dis1.obs['percent_mito'] < 0.5 , :]
sc.pp.normalize_per_cell(D353_Brus_Dis1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D353_Brus_Dis1) # log transform the data
D353_Brus_Dis1.raw = D353_Brus_Dis1
D353_Brus_Dis1_raw = D353_Brus_Dis1 # freeze the object (for later use of the raw state of it)
D353_Brus_Dis1 = D353_Brus_Dis1[:, D353_Brus_Dis1.var['ribo_genes']]
D353_Brus_Dis1
D354_Brus_Dis1 = sc.read_10x_mtx(
'./D354_Brus_Dis1/' + outsPath,
var_names='gene_symbols',
cache=True)
D354_Brus_Dis1.var_names_make_unique()
D354_Brus_Dis1.obs['manip'] = 'D354_Brus_Dis1'
D354_Brus_Dis1.obs['position'] = 'Distal'
D354_Brus_Dis1.obs['method'] = 'Brushing'
D354_Brus_Dis1.obs['donor'] = 'D354'
D354_Brus_Dis1.obs['name'] = ['D354_Brus_Dis1_' + s for s in list(D354_Brus_Dis1.obs.index)]
D354_Brus_Dis1.obs_names = D354_Brus_Dis1.obs['name']
D354_Brus_Dis1
filtered = sc.read_10x_mtx(
'./D354_Brus_Dis1/' + outsPathFilter,
var_names='gene_symbols',
cache=True)
minCount = min(filtered.X.sum(axis=1).A1)
D354_Brus_Dis1.obs['n_counts'] = D354_Brus_Dis1.X.sum(axis=1).A1
sc.pp.filter_cells(D354_Brus_Dis1, min_genes=0)
D354_Brus_Dis1.obs_keys
qc_df = D354_Brus_Dis1.obs[['n_genes','n_counts']]
qc_df = qc_df.sort_values(by = 'n_counts', ascending = False)
qc_df['cell_nb'] = range(0, qc_df.shape[0])
qc_df = qc_df[qc_df['n_counts'] > 0]
qc_df.shape
qc_df_subsetFull = qc_df[qc_df['n_counts'] > minCount]
print(qc_df_subsetFull.shape)
qc_df_subsetEmpty = qc_df[qc_df['n_counts'] <= 50]
qc_df_subsetEmpty = qc_df_subsetEmpty[qc_df_subsetEmpty['n_counts'] > 5]
print(qc_df_subsetEmpty.shape)
fig, ax = plt.subplots()
ax.plot(qc_df[['cell_nb']], qc_df[['n_counts']])
ax.set_xscale('log'); ax.set_yscale('log')
plt.axhline(y=10, color='r') # Limit to empty droplets
plt.axhline(y=minCount, color='k') # Limit to filtered dataset
plt.show()
empty = D354_Brus_Dis1[qc_df_subsetEmpty.index.tolist()]
empty.var['n_cells'] = empty.X.sum(axis=0).A1
back = sum(empty.var['n_cells'])/empty.shape[0] # Background score for the sample
#empty.var.sort_values(by = 'n_cells', ascending = False).iloc[0:50].index.tolist()
D354_Brus_Dis1 = D354_Brus_Dis1[qc_df_subsetFull.index.tolist()]
D354_Brus_Dis1.obs['background'] = back
D354_Brus_Dis1.obs['empty_droplets'] = empty.shape[0]
D354_Brus_Dis1.var['background_gene'] = empty.var['n_cells']
sc.pp.filter_cells(D354_Brus_Dis1, min_genes=0)
mito_genes = D354_Brus_Dis1.var_names.str.startswith('MT-')
D354_Brus_Dis1.obs['percent_mito'] = np.sum(
D354_Brus_Dis1[:, mito_genes].X, axis=1).A1 / np.sum(D354_Brus_Dis1.X, axis=1).A1
D354_Brus_Dis1.obs['n_counts'] = D354_Brus_Dis1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D354_Brus_Dis1.to_df())
ribo_genes = D354_Brus_Dis1.to_df().columns.isin(RB_genes_in_df)
D354_Brus_Dis1.obs['percent_ribo'] = np.sum(
D354_Brus_Dis1[:, ribo_genes].X, axis=1).A1 / np.sum(D354_Brus_Dis1.X, axis=1).A1
D354_Brus_Dis1.var['ribo_genes'] = [not i for i in ribo_genes]
sc.pl.violin(D354_Brus_Dis1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
jitter=0.4, multi_panel=True)
sc.pp.filter_cells(D354_Brus_Dis1, min_genes=500)
D354_Brus_Dis1 = D354_Brus_Dis1[D354_Brus_Dis1.obs['n_counts'] < 30000, :]
D354_Brus_Dis1 = D354_Brus_Dis1[D354_Brus_Dis1.obs['percent_mito'] < 0.5 , :]
sc.pp.normalize_per_cell(D354_Brus_Dis1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D354_Brus_Dis1) # log transform the data
D354_Brus_Dis1.raw = D354_Brus_Dis1
D354_Brus_Dis1_raw = D354_Brus_Dis1 # freeze the object (for later use of the raw state of it)
D354_Brus_Dis1 = D354_Brus_Dis1[:, D354_Brus_Dis1.var['ribo_genes']]
D354_Brus_Dis1
D363_Brus_Dis1 = sc.read_10x_mtx(
'./D363_Brus_Dis1/' + outsPath,
var_names='gene_symbols',
cache=True)
D363_Brus_Dis1.var_names_make_unique()
D363_Brus_Dis1.obs['manip'] = 'D363_Brus_Dis1'
D363_Brus_Dis1.obs['position'] = 'Distal'
D363_Brus_Dis1.obs['method'] = 'Brushing'
D363_Brus_Dis1.obs['donor'] = 'D363'
D363_Brus_Dis1.obs['name'] = ['D363_Brus_Dis1_' + s for s in list(D363_Brus_Dis1.obs.index)]
D363_Brus_Dis1.obs_names = D363_Brus_Dis1.obs['name']
D363_Brus_Dis1
filtered = sc.read_10x_mtx(
'./D363_Brus_Dis1/' + outsPathFilter,
var_names='gene_symbols',
cache=True)
minCount = min(filtered.X.sum(axis=1).A1)
D363_Brus_Dis1.obs['n_counts'] = D363_Brus_Dis1.X.sum(axis=1).A1
sc.pp.filter_cells(D363_Brus_Dis1, min_genes=0)
D363_Brus_Dis1.obs_keys
qc_df = D363_Brus_Dis1.obs[['n_genes','n_counts']]
qc_df = qc_df.sort_values(by = 'n_counts', ascending = False)
qc_df['cell_nb'] = range(0, qc_df.shape[0])
qc_df = qc_df[qc_df['n_counts'] > 0]
qc_df.shape
qc_df_subsetFull = qc_df[qc_df['n_counts'] > minCount]
print(qc_df_subsetFull.shape)
qc_df_subsetEmpty = qc_df[qc_df['n_counts'] <= 50]
qc_df_subsetEmpty = qc_df_subsetEmpty[qc_df_subsetEmpty['n_counts'] > 5]
print(qc_df_subsetEmpty.shape)
fig, ax = plt.subplots()
ax.plot(qc_df[['cell_nb']], qc_df[['n_counts']])
ax.set_xscale('log'); ax.set_yscale('log')
plt.axhline(y=10, color='r') # Limit to empty droplets
plt.axhline(y=minCount, color='k') # Limit to filtered dataset
plt.show()
empty = D363_Brus_Dis1[qc_df_subsetEmpty.index.tolist()]
empty.var['n_cells'] = empty.X.sum(axis=0).A1
back = sum(empty.var['n_cells'])/empty.shape[0] # Background score for the sample
#empty.var.sort_values(by = 'n_cells', ascending = False).iloc[0:50].index.tolist()
D363_Brus_Dis1 = D363_Brus_Dis1[qc_df_subsetFull.index.tolist()]
D363_Brus_Dis1.obs['background'] = back
D363_Brus_Dis1.obs['empty_droplets'] = empty.shape[0]
D363_Brus_Dis1.var['background_gene'] = empty.var['n_cells']
sc.pp.filter_cells(D363_Brus_Dis1, min_genes=0)
mito_genes = D363_Brus_Dis1.var_names.str.startswith('MT-')
D363_Brus_Dis1.obs['percent_mito'] = np.sum(
D363_Brus_Dis1[:, mito_genes].X, axis=1).A1 / np.sum(D363_Brus_Dis1.X, axis=1).A1
D363_Brus_Dis1.obs['n_counts'] = D363_Brus_Dis1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D363_Brus_Dis1.to_df())
ribo_genes = D363_Brus_Dis1.to_df().columns.isin(RB_genes_in_df)
D363_Brus_Dis1.obs['percent_ribo'] = np.sum(
D363_Brus_Dis1[:, ribo_genes].X, axis=1).A1 / np.sum(D363_Brus_Dis1.X, axis=1).A1
D363_Brus_Dis1.var['ribo_genes'] = [not i for i in ribo_genes]
sc.pl.violin(D363_Brus_Dis1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
jitter=0.4, multi_panel=True)
sc.pp.filter_cells(D363_Brus_Dis1, min_genes=500)
D363_Brus_Dis1 = D363_Brus_Dis1[D363_Brus_Dis1.obs['n_counts'] < 40000, :]
D363_Brus_Dis1 = D363_Brus_Dis1[D363_Brus_Dis1.obs['percent_mito'] < 0.5 , :]
sc.pp.normalize_per_cell(D363_Brus_Dis1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D363_Brus_Dis1) # log transform the data
D363_Brus_Dis1.raw = D363_Brus_Dis1
D363_Brus_Dis1_raw = D363_Brus_Dis1 # freeze the object (for later use of the raw state of it)
D363_Brus_Dis1 = D363_Brus_Dis1[:, D363_Brus_Dis1.var['ribo_genes']]
D363_Brus_Dis1
D367_Brus_Dis1 = sc.read_10x_mtx(
'./D367_Brus_Dis1/' + outsPath,
var_names='gene_symbols',
cache=True)
D367_Brus_Dis1.var_names_make_unique()
D367_Brus_Dis1.obs['manip'] = 'D367_Brus_Dis1'
D367_Brus_Dis1.obs['position'] = 'Distal'
D367_Brus_Dis1.obs['method'] = 'Brushing'
D367_Brus_Dis1.obs['donor'] = 'D367'
D367_Brus_Dis1.obs['name'] = ['D367_Brus_Dis1_' + s for s in list(D367_Brus_Dis1.obs.index)]
D367_Brus_Dis1.obs_names = D367_Brus_Dis1.obs['name']
D367_Brus_Dis1
filtered = sc.read_10x_mtx(
'./D367_Brus_Dis1/' + outsPathFilter,
var_names='gene_symbols',
cache=True)
minCount = min(filtered.X.sum(axis=1).A1)
D367_Brus_Dis1.obs['n_counts'] = D367_Brus_Dis1.X.sum(axis=1).A1
sc.pp.filter_cells(D367_Brus_Dis1, min_genes=0)
D367_Brus_Dis1.obs_keys
qc_df = D367_Brus_Dis1.obs[['n_genes','n_counts']]
qc_df = qc_df.sort_values(by = 'n_counts', ascending = False)
qc_df['cell_nb'] = range(0, qc_df.shape[0])
qc_df = qc_df[qc_df['n_counts'] > 0]
qc_df.shape
qc_df_subsetFull = qc_df[qc_df['n_counts'] > minCount]
print(qc_df_subsetFull.shape)
qc_df_subsetEmpty = qc_df[qc_df['n_counts'] <= 50]
qc_df_subsetEmpty = qc_df_subsetEmpty[qc_df_subsetEmpty['n_counts'] > 5]
print(qc_df_subsetEmpty.shape)
fig, ax = plt.subplots()
ax.plot(qc_df[['cell_nb']], qc_df[['n_counts']])
ax.set_xscale('log'); ax.set_yscale('log')
plt.axhline(y=10, color='r') # Limit to empty droplets
plt.axhline(y=minCount, color='k') # Limit to filtered dataset
plt.show()
empty = D367_Brus_Dis1[qc_df_subsetEmpty.index.tolist()]
empty.var['n_cells'] = empty.X.sum(axis=0).A1
back = sum(empty.var['n_cells'])/empty.shape[0] # Background score for the sample
#empty.var.sort_values(by = 'n_cells', ascending = False).iloc[0:50].index.tolist()
D367_Brus_Dis1 = D367_Brus_Dis1[qc_df_subsetFull.index.tolist()]
D367_Brus_Dis1.obs['background'] = back
D367_Brus_Dis1.obs['empty_droplets'] = empty.shape[0]
D367_Brus_Dis1.var['background_gene'] = empty.var['n_cells']
sc.pp.filter_cells(D367_Brus_Dis1, min_genes=0)
mito_genes = D367_Brus_Dis1.var_names.str.startswith('MT-')
D367_Brus_Dis1.obs['percent_mito'] = np.sum(
D367_Brus_Dis1[:, mito_genes].X, axis=1).A1 / np.sum(D367_Brus_Dis1.X, axis=1).A1
D367_Brus_Dis1.obs['n_counts'] = D367_Brus_Dis1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D367_Brus_Dis1.to_df())
ribo_genes = D367_Brus_Dis1.to_df().columns.isin(RB_genes_in_df)
D367_Brus_Dis1.obs['percent_ribo'] = np.sum(
D367_Brus_Dis1[:, ribo_genes].X, axis=1).A1 / np.sum(D367_Brus_Dis1.X, axis=1).A1
D367_Brus_Dis1.var['ribo_genes'] = [not i for i in ribo_genes]
sc.pl.violin(D367_Brus_Dis1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
jitter=0.4, multi_panel=True)
sc.pp.filter_cells(D367_Brus_Dis1, min_genes=500)
D367_Brus_Dis1 = D367_Brus_Dis1[D367_Brus_Dis1.obs['n_counts'] < 25000, :]
D367_Brus_Dis1 = D367_Brus_Dis1[D367_Brus_Dis1.obs['percent_mito'] < 0.5 , :]
sc.pp.normalize_per_cell(D367_Brus_Dis1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D367_Brus_Dis1) # log transform the data
D367_Brus_Dis1.raw = D367_Brus_Dis1
D367_Brus_Dis1_raw = D367_Brus_Dis1 # freeze the object (for later use of the raw state of it)
D367_Brus_Dis1 = D367_Brus_Dis1[:, D367_Brus_Dis1.var['ribo_genes']]
D367_Brus_Dis1
D372_Brus_Dis1 = sc.read_10x_mtx(
'./D372_Brus_Dis1/' + outsPath,
var_names='gene_symbols',
cache=True)
D372_Brus_Dis1.var_names_make_unique()
D372_Brus_Dis1.obs['manip'] = 'D372_Brus_Dis1'
D372_Brus_Dis1.obs['position'] = 'Distal'
D372_Brus_Dis1.obs['method'] = 'Brushing'
D372_Brus_Dis1.obs['donor'] = 'D372'
D372_Brus_Dis1.obs['name'] = ['D372_Brus_Dis1_' + s for s in list(D372_Brus_Dis1.obs.index)]
D372_Brus_Dis1.obs_names = D372_Brus_Dis1.obs['name']
D372_Brus_Dis1
filtered = sc.read_10x_mtx(
'./D372_Brus_Dis1/' + outsPathFilter,
var_names='gene_symbols',
cache=True)
minCount = min(filtered.X.sum(axis=1).A1)
D372_Brus_Dis1.obs['n_counts'] = D372_Brus_Dis1.X.sum(axis=1).A1
sc.pp.filter_cells(D372_Brus_Dis1, min_genes=0)
D372_Brus_Dis1.obs_keys
qc_df = D372_Brus_Dis1.obs[['n_genes','n_counts']]
qc_df = qc_df.sort_values(by = 'n_counts', ascending = False)
qc_df['cell_nb'] = range(0, qc_df.shape[0])
qc_df = qc_df[qc_df['n_counts'] > 0]
qc_df.shape
qc_df_subsetFull = qc_df[qc_df['n_counts'] > minCount]
print(qc_df_subsetFull.shape)
qc_df_subsetEmpty = qc_df[qc_df['n_counts'] <= 50]
qc_df_subsetEmpty = qc_df_subsetEmpty[qc_df_subsetEmpty['n_counts'] > 5]
print(qc_df_subsetEmpty.shape)
fig, ax = plt.subplots()
ax.plot(qc_df[['cell_nb']], qc_df[['n_counts']])
ax.set_xscale('log'); ax.set_yscale('log')
plt.axhline(y=10, color='r') # Limit to empty droplets
plt.axhline(y=minCount, color='k') # Limit to filtered dataset
plt.show()
empty = D372_Brus_Dis1[qc_df_subsetEmpty.index.tolist()]
empty.var['n_cells'] = empty.X.sum(axis=0).A1
back = sum(empty.var['n_cells'])/empty.shape[0] # Background score for the sample
#empty.var.sort_values(by = 'n_cells', ascending = False).iloc[0:50].index.tolist()
D372_Brus_Dis1 = D372_Brus_Dis1[qc_df_subsetFull.index.tolist()]
D372_Brus_Dis1.obs['background'] = back
D372_Brus_Dis1.obs['empty_droplets'] = empty.shape[0]
D372_Brus_Dis1.var['background_gene'] = empty.var['n_cells']
sc.pp.filter_cells(D372_Brus_Dis1, min_genes=0)
mito_genes = D372_Brus_Dis1.var_names.str.startswith('MT-')
D372_Brus_Dis1.obs['percent_mito'] = np.sum(
D372_Brus_Dis1[:, mito_genes].X, axis=1).A1 / np.sum(D372_Brus_Dis1.X, axis=1).A1
D372_Brus_Dis1.obs['n_counts'] = D372_Brus_Dis1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D372_Brus_Dis1.to_df())
ribo_genes = D372_Brus_Dis1.to_df().columns.isin(RB_genes_in_df)
D372_Brus_Dis1.obs['percent_ribo'] = np.sum(
D372_Brus_Dis1[:, ribo_genes].X, axis=1).A1 / np.sum(D372_Brus_Dis1.X, axis=1).A1
D372_Brus_Dis1.var['ribo_genes'] = [not i for i in ribo_genes]
sc.pl.violin(D372_Brus_Dis1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
jitter=0.4, multi_panel=True)
sc.pp.filter_cells(D372_Brus_Dis1, min_genes=500)
D372_Brus_Dis1 = D372_Brus_Dis1[D372_Brus_Dis1.obs['n_counts'] < 30000, :]
D372_Brus_Dis1 = D372_Brus_Dis1[D372_Brus_Dis1.obs['percent_mito'] < 0.5 , :]
sc.pp.normalize_per_cell(D372_Brus_Dis1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D372_Brus_Dis1) # log transform the data
D372_Brus_Dis1.raw = D372_Brus_Dis1
D372_Brus_Dis1_raw = D372_Brus_Dis1 # freeze the object (for later use of the raw state of it)
D372_Brus_Dis1 = D372_Brus_Dis1[:, D372_Brus_Dis1.var['ribo_genes']]
D372_Brus_Dis1
# Merge every pre-processed sample into a single AnnData object.
# The order of the list is the order in which the samples are concatenated.
all_samples = [
    D322_Biop_Nas1, D322_Biop_Pro1, D322_Biop_Int1,
    D326_Biop_Pro1, D326_Biop_Int1, D326_Brus_Dis1,
    D337_Brus_Dis1,
    D339_Biop_Nas1, D339_Biop_Pro1, D339_Biop_Int1, D339_Brus_Dis1,
    D344_Biop_Nas1, D344_Biop_Pro1, D344_Biop_Int1, D344_Brus_Dis1,
    D345_Biop_Nas1,
    D353_Brus_Nas1, D353_Biop_Pro1, D353_Biop_Int2, D353_Brus_Dis1,
    D354_Biop_Pro1, D354_Biop_Int2, D354_Brus_Dis1,
    D363_Brus_Nas1, D363_Biop_Pro1, D363_Biop_Int2, D363_Brus_Dis1,
    D367_Brus_Nas1, D367_Biop_Pro1, D367_Biop_Int1, D367_Brus_Dis1,
    D372_Brus_Nas1, D372_Biop_Pro1, D372_Biop_Int1, D372_Biop_Int2, D372_Brus_Dis1,
]
first_sample, other_samples = all_samples[0], all_samples[1:]
adata = first_sample.concatenate(*other_samples, join='inner')

# Export the per-cell (obs) and per-gene (var) annotations as tab-separated files.
# NOTE(review): these are absolute paths under '/Data/', not under the working
# directory selected with os.chdir() at the top of the file -- confirm the
# destination.
adata.obs.to_csv('/Data/background_metadata.tsv', sep='\t')
adata.var.to_csv('/Data/background_features.tsv', sep='\t')

# Distinct (sample, background score, position) rows, ordered by position.
# Bare expression: only displayed when run as a notebook cell.
background_per_sample = adata.obs[['manip', 'background', 'position']].drop_duplicates()
background_per_sample.sort_values(by='position')